APPENDIX B
Code for screen.py

This appendix displays the code blocks as described and discussed in Chapter 4, “Computational Thinking in Practice.” Each code block is labeled alphabetically for more convenient reference.

A

# screen.py
 
# Like in the app.py code, software modules are loaded in to help set up the
# software environment.
# No need to duplicate the code here!
 
# creates the ResultElement object, containing rank value and filename
class ResultElement:
    """One entry of the screening result: a rank value and a filename."""

    def __init__(self, rank, filename):
        # Both values are stored unchanged as public attributes.
        self.rank, self.filename = rank, filename
 
# function: access and return the filepath reformatted using forward slashes
def getfilepath(loc):
    """Return *loc* as text with every backslash turned into a forward slash.

    Args:
        loc: a path-like value; it is converted with ``str()`` first.

    Returns:
        The path text using ``/`` in place of each backslash.
    """
    # The pattern must be an escaped backslash. Replacing the empty string
    # would instead insert a '/' between every character of the path.
    return str(loc).replace('\\', '/')
 
# function: recursively retrieves resume documents that are formatted as 
# pdf, doc and docx. Each page of resume is captured separately and text
# extracted using extractText() for pdfs or textract.process() for MS Word
def res(jobfile):
    """Screen the resumes under ./Original_Resumes against a job description.

    This listing is the first stage of the function: it gathers the resume
    files and extracts their text. The statements that follow it in the
    appendix continue the same function body.

    Args:
        jobfile: file name of the job description. It is not read in this
            stage.

    Side effects:
        Changes the process working directory to ./Original_Resumes and
        prints progress messages. ``os.chdir`` raises ``FileNotFoundError``
        when that directory does not exist.
    """
    # Containers that the later stages of this function fill in or read.
    Resume_Vector = []
    Ordered_list_Resume = []
    Ordered_list_Resume_Score = []
    Resumes = []

    # The glob patterns are relative, so move into the resumes directory
    # and list the files of each format separately.
    os.chdir('./Original_Resumes')
    LIST_OF_FILES_PDF = glob.glob('**/*.pdf', recursive=True)
    LIST_OF_FILES_DOC = glob.glob('**/*.doc', recursive=True)
    LIST_OF_FILES_DOCX = glob.glob('**/*.docx', recursive=True)

    # Grouping the files by format gives a systematic processing order:
    # .doc first, then .docx, then .pdf.
    LIST_OF_FILES = LIST_OF_FILES_DOC + LIST_OF_FILES_DOCX + LIST_OF_FILES_PDF

    print("This is LIST OF FILES")
    print(LIST_OF_FILES)

    # iterate through filenames and parse files, executing format-
    # specific methods
    print("####### PARSING ########")
    for nooo, i in enumerate(LIST_OF_FILES):
        # splitext() looks at the final extension only, so paths that
        # contain additional dots are still classified correctly.
        extension = os.path.splitext(i)[1].lower()

        if extension == ".pdf":
            try:
                print("This is PDF" , nooo)
                # Collected per file, so a failure part-way through one PDF
                # cannot leak text into the next one.
                pages = []
                with open(i, 'rb') as pdf_file:
                    read_pdf = PyPDF2.PdfFileReader(pdf_file)
                    for page_number in range(read_pdf.getNumPages()):
                        page = read_pdf.getPage(page_number)
                        # main external method: extractText() performs the
                        # actual text extraction for one page
                        page_content = page.extractText()
                        page_content = page_content.replace('\n', ' ')
                        pages.append(str(page_content))
                Resumes.append(''.join(pages))
                # Record the name only once its text exists, keeping the
                # list of names index-aligned with the list of texts.
                Ordered_list_Resume.append(i)
            except Exception as e:
                print(e)

        elif extension in (".doc", ".docx"):
            print("This is DOC" if extension == ".doc" else "This is DOCX" , i)
            try:
                # main external method: textract.process() extracts the
                # text of an MS Word file as bytes
                a = textract.process(i)
                a = a.replace(b'\n', b' ')
                a = a.replace(b'\r', b' ')
                # Decode rather than str(): str() of a bytes object keeps
                # the b'...' wrapper and escape codes in the stored text.
                # NOTE(review): assumes textract's default UTF-8 output.
                Resumes.append(a.decode('utf-8', errors='replace'))
                Ordered_list_Resume.append(i)
            except Exception as e:
                print(e)

        elif extension == ".exe":
            # Not reachable with the three glob patterns above; retained
            # from the original listing.
            print("This is EXE" , i)

    print("Done Parsing.")
 

B

# screen.py
 
    def _condensed(raw):
        # Return a one-document list holding the summary of `raw`, or `raw`
        # itself when no usable summary is produced.
        raw = str(raw)
        try:
            # automate a summary of the text by selecting a few
            # sentences/phrases to represent the entire text
            summary = summarize(raw, word_count=100)
        except Exception as e:
            print(e)
            summary = ''
        # Falling back to the full text keeps every document in play; a
        # dropped resume would shift all later scores onto the wrong file.
        return [summary or raw]

    # Read the job description; jobfile is resolved relative to the
    # Job_Description directory.
    os.chdir('../Job_Description')
    with open(jobfile, 'r') as f:
        text = _condensed(f.read())

    # create a word storage container for all non-stop words
    vectorizer = TfidfVectorizer(stop_words='english')

    # sort words from 'text' into this vectorizer container and process
    # idf values
    vectorizer.fit(text)
    # uses this vocabulary to construct tf-idf-weighted document-term
    # matrix: each word is valued by its frequency (tf) adjusted for
    # relevance (idf) in docs
    vector = vectorizer.transform(text)

    Job_Desc = vector.toarray()

    os.chdir('../')
    for i in Resumes:
        # Each resume is projected onto the vocabulary learned from the job
        # description, giving one single-row array per resume.
        Resume_Vector.append(vectorizer.transform(_condensed(i)).toarray())
 

C

# screen.py
    for i in Resume_Vector:
              samples = i
              # create a container that'll record the closest resume to
              # another resume
       neigh = NearestNeighbors(n_neighbors=1)
       # group resumes based on a resume's frequent words
             neigh.fit(samples) 
             NearestNeighbors(algorithm='auto', leaf_size=30)
Ordered_list_Resume_Score.extend(neigh.kneighbors(Job_Desc)[0][0].tolist())
 
    Z = [x for _,x in sorted(zip(Ordered_list_Resume_Score,Ordered_list_
    # Resume))]
    print(Ordered_list_Resume)
    print(Ordered_list_Resume_Score)
    flask_return = []
    # for n,i in enumerate(Z):
    #     print("Rankkkkk	" , n+1, ":	" , i)
 
    for n,i in enumerate(Z):
        # print("Rank	" , n+1, ":	" , i)
        # flask_return.append(str("Rank	" , n+1, ":	" , i))
        name = getfilepath(i)
        #name = name.split('.')[0]
        rank = n+1
        res = ResultElement(rank, name)
        flask_return.append(res)
        # res.printresult()
        print(f"Rank{res.rank+1} :	 {res.filename}")
    return flask_return
 
if __name__ == '__main__':
    # Read the job-description file name from standard input and run the
    # screening on it. The listing called sear(), which nothing visible in
    # this file defines; res() is the function that takes the job file.
    inputStr = input("")
    res(inputStr)
 
..................Content has been hidden....................

You can't read all pages of the ebook; please click here to log in and view all pages.
Reset