This appendix displays the code blocks as described and discussed in Chapter 4, “Computational Thinking in Practice.” Each code block is labeled alphabetically for more convenient reference.
# screen.py
# Like in the app.py code, software modules are loaded in to help set up the
# software environment.
# No need to duplicate the code here!
# creates the ResultElement object, containing rank value and filename
class ResultElement:
    """One entry of the ranked result list: a rank and the resume's filename."""

    def __init__(self, rank, filename):
        # rank: position of the resume in the ordering (stored as given)
        # filename: path of the resume file (stored as given)
        self.rank = rank
        self.filename = filename

    def __repr__(self):
        return (
            f"{type(self).__name__}"
            f"(rank={self.rank!r}, filename={self.filename!r})"
        )
# function: access and return the filepath reformatted using forward slashes
def getfilepath(loc):
    """Return ``loc`` as a string with every backslash replaced by '/'.

    Args:
        loc: A path-like value; it is converted with ``str()`` first.

    Returns:
        The path text using forward slashes only. Text that contains no
        backslashes is returned unchanged.
    """
    # The search string must be a backslash: replacing the empty string
    # would insert a '/' between every character of the path.
    return str(loc).replace('\\', '/')
# function: recursively retrieves resume documents that are formatted as
# pdf, doc and docx. Each page of resume is captured separately and text
# extracted using extractText() for pdfs or textract.process() for MS Word
def _read_resume_text(path, index):
    """Extract the text of a single resume file.

    Args:
        path: File name, relative to the current working directory.
        index: Position of the file in the processing order; only used in
            the progress message printed for PDFs.

    Returns:
        The extracted text as a string, or None when the file extension is
        not pdf, doc or docx (compared case-insensitively).
    """
    # splitext looks at the last dot only, so names such as
    # "jane.doe.pdf" are still recognised.
    extension = os.path.splitext(path)[1].lower()

    if extension == '.pdf':
        print("This is PDF", index)
        pages = []
        with open(path, 'rb') as pdf_file:
            read_pdf = PyPDF2.PdfFileReader(pdf_file)
            for page_number in range(read_pdf.getNumPages()):
                page_content = read_pdf.getPage(page_number).extractText()
                # NOTE(review): the printed listing shows replace(' ', ' '),
                # a no-op; the escape sequence was presumably lost in
                # typesetting. A newline is assumed here -- confirm against
                # the original screen.py.
                pages.append(str(page_content).replace('\n', ' '))
        # A per-file buffer keeps text from one PDF out of the next, even
        # when a page fails part-way through.
        return ''.join(pages)

    if extension in ('.doc', '.docx'):
        print("This is DOC" if extension == '.doc' else "This is DOCX", path)
        raw = textract.process(path)
        # NOTE(review): same lost-escape assumption as above (newline and
        # carriage return) -- confirm against the original screen.py.
        raw = raw.replace(b'\n', b' ').replace(b'\r', b' ')
        # Decode rather than str(): str() on bytes yields the "b'...'" repr
        # with escaped non-ASCII bytes instead of the actual text.
        # Assumes textract's default UTF-8 output -- TODO confirm.
        return raw.decode('utf-8', errors='replace')

    return None


def res(jobfile):
    """Rank the resumes under ./Original_Resumes against a job description.

    Resumes (pdf, doc, docx) are collected recursively, their text is
    extracted and summarised, and each summary is turned into a tf-idf
    vector using the vocabulary of the summarised job description. Resumes
    are ordered by their nearest-neighbour distance to the job description
    vector, smallest distance first.

    Progress and results are printed to stdout. The working directory is
    changed while files are read and is restored before vectorising.

    Args:
        jobfile: Name of the job description file, opened relative to the
            ./Job_Description directory.

    Returns:
        A list of ResultElement objects, one per resume that could be
        parsed and summarised, with rank 1 for the closest match.
    """
    start_dir = os.getcwd()
    try:
        # Collect resumes; glob returns names relative to Original_Resumes.
        os.chdir('./Original_Resumes')
        pdf_files = glob.glob('**/*.pdf', recursive=True)
        doc_files = glob.glob('**/*.doc', recursive=True)
        docx_files = glob.glob('**/*.docx', recursive=True)
        list_of_files = doc_files + docx_files + pdf_files
        print("This is LIST OF FILES")
        print(list_of_files)

        print("####### PARSING ########")
        # Name and text are stored together so that a file that fails to
        # parse cannot shift the names relative to the scores later on.
        parsed = []
        for index, name in enumerate(list_of_files):
            try:
                text = _read_resume_text(name, index)
            except Exception as e:
                # Best effort: report the failure and move on.
                print(e)
                continue
            if text is not None:
                parsed.append((name, text))
        print("Done Parsing.")

        os.chdir('../Job_Description')
        with open(jobfile, 'r') as f:
            job_text = f.read()
    finally:
        # Restore the caller's working directory even when reading fails.
        os.chdir(start_dir)

    # Summarise the job description; if that fails or yields nothing, fall
    # back to the full text so the vectorizer still gets a list of documents.
    try:
        job_summary = summarize(job_text, word_count=100)
    except Exception as e:
        print(e)
        job_summary = ''
    job_docs = [job_summary or job_text]

    # Build the tf-idf vocabulary from the job description only; resumes
    # are projected onto that vocabulary.
    vectorizer = TfidfVectorizer(stop_words='english')
    vectorizer.fit(job_docs)
    job_desc = vectorizer.transform(job_docs).toarray()

    scored_names = []
    resume_vectors = []
    for name, text in parsed:
        try:
            summary = summarize(text, word_count=100)
            vector = vectorizer.transform([summary]).toarray()
        except Exception as e:
            # Resumes that cannot be summarised are left out of the ranking.
            print(e)
            continue
        scored_names.append(name)
        resume_vectors.append(vector)

    # Each resume is fitted on its own, so the single nearest-neighbour
    # distance is the distance between that resume and the job description.
    scores = []
    for samples in resume_vectors:
        neigh = NearestNeighbors(n_neighbors=1)
        neigh.fit(samples)
        scores.extend(neigh.kneighbors(job_desc)[0][0].tolist())

    ranked = [name for _, name in sorted(zip(scores, scored_names))]
    print(scored_names)
    print(scores)

    flask_return = []
    for rank, name in enumerate(ranked, start=1):
        result = ResultElement(rank, getfilepath(name))
        flask_return.append(result)
        print(f"Rank{result.rank} : {result.filename}")
    return flask_return
if __name__ == '__main__':
    # Script entry point: read the job description file name from stdin
    # and rank the resumes against it. The name `sear` is not defined in
    # this file; `res` is the function that performs the screening.
    inputStr = input("")
    res(inputStr)