This appendix presents the code blocks described and discussed in Chapter 4, “Computational Thinking in Practice.” Each code block is labeled alphabetically for convenient reference.
# search.py
# As in the app.py code, software modules are loaded to set up the
# software environment.
# No need to duplicate the code here!
app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = 'Original_Resumes/'
app.config['ALLOWED_EXTENSIONS'] = set(['txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif'])

# creates the ResultElement object, containing rank value and filename
class ResultElement:
    def __init__(self, rank, filename):
        self.rank = rank
        self.filename = filename
def allowed_file(filename):
    return '.' in filename and \
        filename.rsplit('.', 1)[1] in app.config['ALLOWED_EXTENSIONS']
import re, string, unicodedata # software libraries for regular expressions, strings, and Unicode data
import nltk # natural language toolkit software library
import contractions # software library to handle English's contraction structure
import inflect # software library for converting numbers into their word equivalents
# software library for pulling data out of HTML and XML files
from bs4 import BeautifulSoup
# software library for identifying words (tokens) in a stream of text
from nltk import word_tokenize, sent_tokenize
# software library that lists stop words for each language
from nltk.corpus import stopwords
# software library for finding the root words/meanings of words
from nltk.stem import LancasterStemmer, WordNetLemmatizer
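The NLTK imports above rely on language data files that are downloaded separately from the library itself. As a minimal setup sketch (an assumption on our part, since the environment setup lives in app.py and is not reproduced in this appendix), the one-time downloads would be:

# illustration only -- not part of search.py
import nltk
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('stopwords')  # stop-word lists used by remove_stopwords
nltk.download('wordnet')    # lexical database used by WordNetLemmatizer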
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words
def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words
def replace_numbers(words):
    """Replace all integer occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words
def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        # print(word)
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas
# text processing routine, in order: keep the most "valuable" words
def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    # words = replace_numbers(words)
    words = remove_stopwords(words)
    words = stem_words(words)
    words = lemmatize_verbs(words)
    return words
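To make the effect of normalize() concrete, here is a small illustrative run (the sample sentence is made up, and the exact output depends on the installed NLTK data and the aggressive LancasterStemmer):

# illustration only -- not part of search.py
sample = nltk.word_tokenize("Managed engineering teams, shipping Python services!")
print(normalize(sample))
# punctuation, stop words, and case are gone; what remains are the
# stemmed root forms of the content words 'managed', 'engineering',
# 'teams', 'shipping', 'python', and 'services'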
# search.py
def getfilepath(loc):
    temp = str(loc)
    temp = temp.replace('\\', '/') # convert Windows-style path separators
    return temp
def res(jobfile):
    Final_Array = []

    # longest common substring method: score contiguous character matches
    def lcs(X, Y):
        try:
            mat = []
            for i in range(0, len(X)):
                row = []
                for j in range(0, len(Y)):
                    if X[i] == Y[j]:
                        if i == 0 or j == 0:
                            row.append(1)
                        else:
                            val = 1 + int(mat[i-1][j-1])
                            row.append(val)
                    else:
                        row.append(0)
                mat.append(row)
            new_mat = []
            for r in mat:
                r.sort()
                r.reverse()
                new_mat.append(r)
            lcs = 0
            for r in new_mat:
                if lcs < r[0]:
                    lcs = r[0]
            return lcs
        except Exception:
            return -9999 # sentinel score when the comparison fails
    # spell() comes from the modules loaded in app.py (not duplicated here)
    def spellCorrect(string):
        words = string.split(" ")
        correctWords = []
        for i in words:
            correctWords.append(spell(i))
        return " ".join(correctWords)
    # match strings (partial and whole) to sentences in resumes
    def semanticSearch(searchString, searchSentencesList):
        result = None
        searchString = spellCorrect(searchString)
        bestScore = 0
        for i in searchSentencesList:
            score = lcs(searchString, i) # find if search string is in a sentence
            print(score, i[0:100])
            print("")
            temp = [score]
            Final_Array.extend(temp)
            if score > bestScore:
                bestScore = score
                result = i
        return result
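Note that lcs() only extends a run while consecutive characters keep matching, so the score it returns is the length of the longest common contiguous substring of its two arguments. Because lcs() is nested inside res(), you would lift it to module level to try it on its own; with that assumption (and made-up sample strings):

# illustration only -- not part of search.py
print(lcs("python developer", "senior python engineer"))
# 7 -- the shared run is "python " (six letters plus the trailing space)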
# search.py
    app.config['UPLOAD_FOLDER'] = 'Original_Resumes/'
    app.config['ALLOWED_EXTENSIONS'] = set(['txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif'])
    def allowed_file(filename):
        return '.' in filename and \
            filename.rsplit('.', 1)[1] in app.config['ALLOWED_EXTENSIONS']
    Resume_Vector = []
    Ordered_list_Resume = []
    Ordered_list_Resume_Score = []
    LIST_OF_FILES = []
    LIST_OF_FILES_PDF = []
    LIST_OF_FILES_DOC = []
    LIST_OF_FILES_DOCX = []
    Resumes_File_Names = []
    Resumes = []
    Temp_pdf = ''
    os.chdir('./Original_Resumes')
    for file in glob.glob('**/*.pdf', recursive=True):
        LIST_OF_FILES_PDF.append(file)
    for file in glob.glob('**/*.doc', recursive=True):
        LIST_OF_FILES_DOC.append(file)
    for file in glob.glob('**/*.docx', recursive=True):
        LIST_OF_FILES_DOCX.append(file)
    LIST_OF_FILES = LIST_OF_FILES_DOC + LIST_OF_FILES_DOCX + LIST_OF_FILES_PDF
    # LIST_OF_FILES.remove("antiword.exe")
    print("This is LIST OF FILES")
    print(LIST_OF_FILES)
    # print("Total Files to Parse ", len(LIST_OF_PDF_FILES))
    print("####### PARSING ########")
    for nooo, i in enumerate(LIST_OF_FILES):
        Ordered_list_Resume.append(i)
        Temp = i.split(".")
        if Temp[1] == "pdf" or Temp[1] == "Pdf" or Temp[1] == "PDF":
            try:
                print("This is PDF", nooo)
                with open(i, 'rb') as pdf_file:
                    read_pdf = PyPDF2.PdfFileReader(pdf_file)
                    # page = read_pdf.getPage(0)
                    # page_content = page.extractText()
                    # Resumes.extend(Temp_pdf)
                    number_of_pages = read_pdf.getNumPages()
                    for page_number in range(number_of_pages):
                        page = read_pdf.getPage(page_number)
                        page_content = page.extractText()
                        page_content = page_content.replace('\n', ' ') # strip line breaks
                        # page_content.replace(" ", "")
                        Temp_pdf = Temp_pdf + str(page_content)
                        # Temp_pdf.append(page_content)
                    # print(Temp_pdf)
                    Resumes.extend([Temp_pdf])
                    Temp_pdf = ''
                    Resumes_File_Names.append(i)
                    # f = open(str(i)+str("+"), 'w')
                    # f.write(page_content)
                    # f.close()
            except Exception as e:
                print(e)
if Temp[1] == "doc" or Temp[1] == "Doc" or Temp[1] == "DOC":
print("This is DOC" , i)
try:
a = textract.process(i)
a = a.replace(b' ', b' ')
a = a.replace(b' ', b' ')
b = str(a)
c = [b]
Resumes.extend(c)
Resumes_File_Names.append(i)
except Exception as e: print(e)
if Temp[1] == "docx" or Temp[1] == "Docx" or Temp[1] == "DOCX":
print("This is DOCX" , i)
try:
a = textract.process(i)
a = a.replace(b' ', b' ')
a = a.replace(b' ', b' ')
b = str(a)
c = [b]
Resumes.extend(c)
Resumes_File_Names.append(i)
except Exception as e: print(e)
        # Resumes.extend(textract.process(i))
        if Temp[1] == "exe" or Temp[1] == "Exe" or Temp[1] == "EXE":
            # print("This is EXE", i)
            pass
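A caution on the PDF branch above: it uses the legacy PyPDF2 1.x names (PdfFileReader, getNumPages, getPage, extractText), which were removed in later releases. As a hedged sketch of the same page-by-page extraction against the current pypdf API (the filename here is hypothetical):

# illustration only -- not part of search.py
from pypdf import PdfReader

reader = PdfReader("resume.pdf")  # hypothetical file
text = ""
for page in reader.pages:
    text += (page.extract_text() or "").replace("\n", " ")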
# search.py
# print("This is length of Resume Vector : " , len(Resumes))
# # # print(Resumes[1][0:10])
# for m , i in enumerate(Resumes):
# print("This is m : " , m , i[0][0:100])
# print("#######################################################################")
for m,i in enumerate(Resumes):
Resumes[m] = nltk.word_tokenize(Resumes[m])
Resumes[m] = normalize(Resumes[m])
Resumes[m] = ' '.join(map(str, Resumes[m]))
# identify the most valuable words within the job description
jobfile = nltk.word_tokenize(jobfile)
jobfile = normalize(jobfile)
jobfile = ' '.join(map(str, jobfile))
# Resumes2 = np.array(Resumes)
# Resumes2 = Resumes2.ravel()
# print(len(Resumes))
# Resumes = ['microsoft is dumb' , 'google is awesome' , 'facebook is cheater']
print("This is len Resumes : " , len(Resumes))
os.chdir('../')
print("#############################################################")
# a = input("Enter String to Search : ")
print(" ")
print("Printing Scores of all Resumes…")
print(" ")
    # find resumes that map to the job description (but do we find good matches?)
    result = semanticSearch(jobfile, Resumes)
    print(" ")
    print("Printing 1 Best Result...")
    print(" ")
    print(result)
    print(" ")
    print("##############################################")
    print(" ")
    print(Final_Array)
    print("This is len Final_Array : ", len(Final_Array))
    print(Resumes_File_Names)
    print("This is len Ordered_list_Resume : ", len(Resumes_File_Names))
    Ordered_list_Resume = Ordered_list_Resume[1:]
    # print(Ordered_list_Resume)
    Z = [x for _, x in sorted(zip(Final_Array, Resumes_File_Names), reverse=True)]
    flask_return = []
    # for n, i in enumerate(Z):
    #     print("Rankkkkk ", n+1, ": ", i)
    for n, i in enumerate(Z):
        # print("Rank ", n+1, ": ", i)
        # flask_return.append(str("Rank ", n+1, ": ", i))
        name = getfilepath(i)
        # name = name.split('.')[0]
        rank = n
        res = ResultElement(rank, name)
        flask_return.append(res)
        # res.printresult()
        # print(f"Rank {res.rank+1} : {res.filename}")
    return flask_return
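Finally, res() returns a list of ResultElement objects for the Flask layer to render. A minimal sketch of the calling side (the route name, form field, and template are assumptions, since app.py is not reproduced in this appendix):

# illustration only -- a hypothetical route in app.py
from flask import request, render_template

@app.route('/search', methods=['POST'])
def search_route():
    jobfile = request.form['job_description']  # hypothetical form field
    results = res(jobfile)                     # ranked ResultElement list
    return render_template('results.html', results=results)  # hypothetical template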