Document Redaction & Censoring Web App with SpaCy

In this tutorial we will be building a document redaction and sanitization web application with flask and spacy. We will be using spaCy’s named entity recognition to help us in our document redaction and censorship.

SpaCy has a very powerful named entity recognition feature that allows us to do some interesting things with it.

This web app has two parts with the following structure

static
templates
app.py

Front-End

We will be using material design bootstrap for designing our front-end.

check full code here with front-end here

Back-End

We will be using python’s flask micro-framework as our back-end.

Below is the code for the backend


from flask import Flask,url_for,render_template,request,send_file,redirect
from flask_uploads import UploadSet,configure_uploads,ALL,DATA
from werkzeug import secure_filename

# Other Packages
import os
import spacy
nlp = spacy.load('en')

import time
timestr = time.strftime("%Y%m%d-%H%M%S")

# Initialize App
app = Flask(__name__)
# Configuration For Uploads
files = UploadSet('files',ALL)
app.config['UPLOADED_FILES_DEST'] = 'static/uploadedfiles'
configure_uploads(app,files)

# Functions to Sanitize and Redact 
def sanitize_names(text):
    docx = nlp(text)
    redacted_sentences = []
    for ent in docx.ents:
        ent.merge()
    for token in docx:
        if token.ent_type_ == 'PERSON':
            redacted_sentences.append("[REDACTED NAME]")
        else:
            redacted_sentences.append(token.string)
    return "".join(redacted_sentences)

def sanitize_places(text):
    docx = nlp(text)
    redacted_sentences = []
    for ent in docx.ents:
        ent.merge()
    for token in docx:
        if token.ent_type_ == 'GPE':
            redacted_sentences.append("[REDACTED PLACE]")
        else:
            redacted_sentences.append(token.string)
    return "".join(redacted_sentences)

def sanitize_date(text):
    docx = nlp(text)
    redacted_sentences = []
    for ent in docx.ents:
        ent.merge()
    for token in docx:
        if token.ent_type_ == 'DATE':
            redacted_sentences.append("[REDACTED DATE]")
        else:
            redacted_sentences.append(token.string)
    return "".join(redacted_sentences)

def sanitize_org(text):
    docx = nlp(text)
    redacted_sentences = []
    for ent in docx.ents:
        ent.merge()
    for token in docx:
        if token.ent_type_ == 'ORG':
            redacted_sentences.append("[REDACTED]")
        else:
            redacted_sentences.append(token.string)
    return "".join(redacted_sentences)

def writetofile(text):
	file_name = 'yourdocument' + timestr + '.txt'
	with open(os.path.join('static/downloadfiles',file_name),'w') as f:
		f.write(text)

		
@app.route('/')
def index():
	return render_template('index.html')

@app.route('/sanitize',methods=['GET','POST'])
def sanitize():
	if request.method == 'POST':
		choice = request.form['taskoption']
		rawtext = request.form['rawtext']
		if choice == 'redact_names':
			result = sanitize_names(rawtext)
		elif choice == 'places':
			result = sanitize_places(rawtext)
		elif choice == 'date':
			result = sanitize_date(rawtext)
		elif choice == 'org':
			result = sanitize_org(rawtext)
		else:
			result = sanitize_names(rawtext)
	return render_template('index.html',rawtext=rawtext,result=result)


@app.route('/uploads',methods=['GET','POST'])
def uploads():
	if request.method == 'POST' and 'txt_data' in request.files:
		file = request.files['txt_data']
		choice = request.form['saveoption']
		filename = secure_filename(file.filename)
		file.save(os.path.join('static/uploadedfiles',filename))

		# Document Redaction Here
		with open(os.path.join('static/uploadedfiles',filename),'r+') as f:
			myfile = f.read()
			result = sanitize_names(myfile)
		if choice == 'savetotxt':
			new_res = writetofile(result)
			return redirect(url_for('downloads'))
		elif choice == 'no_save':
			pass
		else:
			pass

		


	return render_template('result.html',filename=filename,result=result,myfile=myfile)

@app.route('/downloads')
def downloads():
	files = os.listdir(os.path.join('static/downloadfiles'))
	return render_template('downloadsdirectory.html',files=files)

if __name__ == '__main__':
app.run(debug=True)

Below is a video tutorial of the entire process.

Thanks for your time

By Jesse JCharis

Jesus Saves@JCharisTech