This appendix presents the code blocks described and discussed in Chapter 4, “Computational Thinking in Practice.” Each code block is labeled alphabetically for convenient reference.
# search.py
# As in the app.py code, software modules are loaded to set up the
# software environment.
# No need to duplicate the code here!
app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = 'Original_Resumes/'
app.config['ALLOWED_EXTENSIONS'] = set(['txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif'])

# creates the ResultElement object, containing rank value and filename
class ResultElement:
    def __init__(self, rank, filename):
        self.rank = rank
        self.filename = filename
def allowed_file(filename):
    return '.' in filename and \
        filename.rsplit('.', 1)[1] in app.config['ALLOWED_EXTENSIONS']
import re, string, unicodedata # software libraries for regular expressions, strings, and Unicode data
import nltk # natural language toolkit software library
import contractions # software library to handle English's contraction structure
import inflect # software library for converting numbers into their word equivalents
# software library for pulling data out of HTML and XML files
from bs4 import BeautifulSoup
# software library for identifying words (tokens) in a stream of text
from nltk import word_tokenize, sent_tokenize
# software library that lists stop words for each language
from nltk.corpus import stopwords
# software library for finding the root words/meanings of words
from nltk.stem import LancasterStemmer, WordNetLemmatizer
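The NLTK imports above rely on language data files that are downloaded separately from the library itself. As a minimal setup sketch (an assumption on our part, since the environment setup lives in app.py and is not reproduced in this appendix), the one-time downloads would be:

# illustration only -- not part of search.py
import nltk
nltk.download('punkt')      # tokenizer models used by word_tokenize
nltk.download('stopwords')  # stop-word lists used by remove_stopwords
nltk.download('wordnet')    # lexical database used by WordNetLemmatizer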
def remove_non_ascii(words):
    """Remove non-ASCII characters from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = unicodedata.normalize('NFKD', word).encode('ascii', 'ignore').decode('utf-8', 'ignore')
        new_words.append(new_word)
    return new_words

def to_lowercase(words):
    """Convert all characters to lowercase from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = word.lower()
        new_words.append(new_word)
    return new_words
def remove_punctuation(words):
    """Remove punctuation from list of tokenized words"""
    new_words = []
    for word in words:
        new_word = re.sub(r'[^\w\s]', '', word)
        if new_word != '':
            new_words.append(new_word)
    return new_words
def replace_numbers(words):
    """Replace all integer occurrences in list of tokenized words with textual representation"""
    p = inflect.engine()
    new_words = []
    for word in words:
        if word.isdigit():
            new_word = p.number_to_words(word)
            new_words.append(new_word)
        else:
            new_words.append(word)
    return new_words
def remove_stopwords(words):
    """Remove stop words from list of tokenized words"""
    new_words = []
    for word in words:
        # print(word)
        if word not in stopwords.words('english'):
            new_words.append(word)
    return new_words

def stem_words(words):
    """Stem words in list of tokenized words"""
    stemmer = LancasterStemmer()
    stems = []
    for word in words:
        stem = stemmer.stem(word)
        stems.append(stem)
    return stems

def lemmatize_verbs(words):
    """Lemmatize verbs in list of tokenized words"""
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for word in words:
        lemma = lemmatizer.lemmatize(word, pos='v')
        lemmas.append(lemma)
    return lemmas
# text processing routine, in order: keep the most "valuable" words
def normalize(words):
    words = remove_non_ascii(words)
    words = to_lowercase(words)
    words = remove_punctuation(words)
    # words = replace_numbers(words)
    words = remove_stopwords(words)
    words = stem_words(words)
    words = lemmatize_verbs(words)
    return words
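To make the effect of normalize() concrete, here is a small illustrative run (the sample sentence is made up, and the exact output depends on the installed NLTK data and the aggressive LancasterStemmer):

# illustration only -- not part of search.py
sample = nltk.word_tokenize("Managed engineering teams, shipping Python services!")
print(normalize(sample))
# punctuation, stop words, and case are gone; what remains are the
# stemmed root forms of the content words 'managed', 'engineering',
# 'teams', 'shipping', 'python', and 'services'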
# search.py
def getfilepath(loc):
    temp = str(loc)
    temp = temp.replace('\\', '/') # convert Windows-style path separators
    return temp
def res(jobfile):
    Final_Array = []

    # longest common substring method: score contiguous character matches
    def lcs(X, Y):
        try:
            mat = []
            for i in range(0, len(X)):
                row = []
                for j in range(0, len(Y)):
                    if X[i] == Y[j]:
                        if i == 0 or j == 0:
                            row.append(1)
                        else:
                            val = 1 + int(mat[i-1][j-1])
                            row.append(val)
                    else:
                        row.append(0)
                mat.append(row)
            new_mat = []
            for r in mat:
                r.sort()
                r.reverse()
                new_mat.append(r)
            lcs = 0
            for r in new_mat:
                if lcs < r[0]:
                    lcs = r[0]
            return lcs
        except Exception:
            return -9999 # sentinel score when the comparison fails
    # spell() comes from the modules loaded in app.py (not duplicated here)
    def spellCorrect(string):
        words = string.split(" ")
        correctWords = []
        for i in words:
            correctWords.append(spell(i))
        return " ".join(correctWords)
    # match strings (partial and whole) to sentences in resumes
    def semanticSearch(searchString, searchSentencesList):
        result = None
        searchString = spellCorrect(searchString)
        bestScore = 0
        for i in searchSentencesList:
            score = lcs(searchString, i) # find if search string is in a sentence
            print(score, i[0:100])
            print("")
            temp = [score]
            Final_Array.extend(temp)
            if score > bestScore:
                bestScore = score
                result = i
        return result
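Note that lcs() only extends a run while consecutive characters keep matching, so the score it returns is the length of the longest common contiguous substring of its two arguments. Because lcs() is nested inside res(), you would lift it to module level to try it on its own; with that assumption (and made-up sample strings):

# illustration only -- not part of search.py
print(lcs("python developer", "senior python engineer"))
# 7 -- the shared run is "python " (six letters plus the trailing space)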
# search.py
    app.config['UPLOAD_FOLDER'] = 'Original_Resumes/'
    app.config['ALLOWED_EXTENSIONS'] = set(['txt', 'pdf', 'png', 'jpg', 'jpeg', 'gif'])
    def allowed_file(filename):
        return '.' in filename and \
            filename.rsplit('.', 1)[1] in app.config['ALLOWED_EXTENSIONS']
    Resume_Vector = []
    Ordered_list_Resume = []
    Ordered_list_Resume_Score = []
    LIST_OF_FILES = []
    LIST_OF_FILES_PDF = []
    LIST_OF_FILES_DOC = []
    LIST_OF_FILES_DOCX = []
    Resumes_File_Names = []
    Resumes = []
    Temp_pdf = ''
    os.chdir('./Original_Resumes')
    for file in glob.glob('**/*.pdf', recursive=True):
        LIST_OF_FILES_PDF.append(file)
    for file in glob.glob('**/*.doc', recursive=True):
        LIST_OF_FILES_DOC.append(file)
    for file in glob.glob('**/*.docx', recursive=True):
        LIST_OF_FILES_DOCX.append(file)
    LIST_OF_FILES = LIST_OF_FILES_DOC + LIST_OF_FILES_DOCX + LIST_OF_FILES_PDF
    # LIST_OF_FILES.remove("antiword.exe")
    print("This is LIST OF FILES")
    print(LIST_OF_FILES)
    # print("Total Files to Parse ", len(LIST_OF_PDF_FILES))
    print("####### PARSING ########")
    for nooo, i in enumerate(LIST_OF_FILES):
        Ordered_list_Resume.append(i)
        Temp = i.split(".")
        if Temp[1] == "pdf" or Temp[1] == "Pdf" or Temp[1] == "PDF":
            try:
                print("This is PDF", nooo)
                with open(i, 'rb') as pdf_file:
                    read_pdf = PyPDF2.PdfFileReader(pdf_file)
                    # page = read_pdf.getPage(0)
                    # page_content = page.extractText()
                    # Resumes.extend(Temp_pdf)
                    number_of_pages = read_pdf.getNumPages()
                    for page_number in range(number_of_pages):
                        page = read_pdf.getPage(page_number)
                        page_content = page.extractText()
                        page_content = page_content.replace('\n', ' ') # strip line breaks
                        # page_content.replace(" ", "")
                        Temp_pdf = Temp_pdf + str(page_content)
                        # Temp_pdf.append(page_content)
                    # print(Temp_pdf)
                    Resumes.extend([Temp_pdf])
                    Temp_pdf = ''
                    Resumes_File_Names.append(i)
                    # f = open(str(i)+str("+"), 'w')
                    # f.write(page_content)
                    # f.close()
            except Exception as e:
                print(e)
if Temp[1] == "doc" or Temp[1] == "Doc" or Temp[1] == "DOC":
print("This is DOC" , i)
try:
a = textract.process(i)
a = a.replace(b' ', b' ')
a = a.replace(b' ', b' ')
b = str(a)
c = [b]
Resumes.extend(c)
Resumes_File_Names.append(i)
except Exception as e: print(e)
if Temp[1] == "docx" or Temp[1] == "Docx" or Temp[1] == "DOCX":
print("This is DOCX" , i)
try:
a = textract.process(i)
a = a.replace(b' ', b' ')
a = a.replace(b' ', b' ')
b = str(a)
c = [b]
Resumes.extend(c)
Resumes_File_Names.append(i)
except Exception as e: print(e)
        # Resumes.extend(textract.process(i))
        if Temp[1] == "exe" or Temp[1] == "Exe" or Temp[1] == "EXE":
            # print("This is EXE", i)
            pass
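A caution on the PDF branch above: it uses the legacy PyPDF2 1.x names (PdfFileReader, getNumPages, getPage, extractText), which were removed in later releases. As a hedged sketch of the same page-by-page extraction against the current pypdf API (the filename here is hypothetical):

# illustration only -- not part of search.py
from pypdf import PdfReader

reader = PdfReader("resume.pdf")  # hypothetical file
text = ""
for page in reader.pages:
    text += (page.extract_text() or "").replace("\n", " ")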
# search.py
# print("This is length of Resume Vector : " , len(Resumes))
# # # print(Resumes[1][0:10])
# for m , i in enumerate(Resumes):
# print("This is m : " , m , i[0][0:100])
# print("#######################################################################")
for m,i in enumerate(Resumes):
Resumes[m] = nltk.word_tokenize(Resumes[m])
Resumes[m] = normalize(Resumes[m])
Resumes[m] = ' '.join(map(str, Resumes[m]))
# identify the most valuable words within the job description
jobfile = nltk.word_tokenize(jobfile)
jobfile = normalize(jobfile)
jobfile = ' '.join(map(str, jobfile))
# Resumes2 = np.array(Resumes)
# Resumes2 = Resumes2.ravel()
# print(len(Resumes))
# Resumes = ['microsoft is dumb' , 'google is awesome' , 'facebook is cheater']
print("This is len Resumes : " , len(Resumes))
os.chdir('../')
print("#############################################################")
# a = input("Enter String to Search : ")
print(" ")
print("Printing Scores of all Resumes…")
print(" ")
    # find resumes that map to the job description (but do we find good matches?)
    result = semanticSearch(jobfile, Resumes)
    print(" ")
    print("Printing 1 Best Result...")
    print(" ")
    print(result)
    print(" ")
    print("##############################################")
    print(" ")
    print(Final_Array)
    print("This is len Final_Array : ", len(Final_Array))
    print(Resumes_File_Names)
    print("This is len Ordered_list_Resume : ", len(Resumes_File_Names))
    Ordered_list_Resume = Ordered_list_Resume[1:]
    # print(Ordered_list_Resume)
    Z = [x for _, x in sorted(zip(Final_Array, Resumes_File_Names), reverse=True)]
    flask_return = []
    # for n, i in enumerate(Z):
    #     print("Rankkkkk ", n+1, ": ", i)
    for n, i in enumerate(Z):
        # print("Rank ", n+1, ": ", i)
        # flask_return.append(str("Rank ", n+1, ": ", i))
        name = getfilepath(i)
        # name = name.split('.')[0]
        rank = n
        res = ResultElement(rank, name)
        flask_return.append(res)
        # res.printresult()
        # print(f"Rank {res.rank+1} : {res.filename}")
    return flask_return
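Finally, res() returns a list of ResultElement objects for the Flask layer to render. A minimal sketch of the calling side (the route name, form field, and template are assumptions, since app.py is not reproduced in this appendix):

# illustration only -- a hypothetical route in app.py
from flask import request, render_template

@app.route('/search', methods=['POST'])
def search_route():
    jobfile = request.form['job_description']  # hypothetical form field
    results = res(jobfile)                     # ranked ResultElement list
    return render_template('results.html', results=results)  # hypothetical template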