This is an interesting application of natural language processing. Sentence auto-completion is an interesting feature that is shockingly absent in our modern-day browsers and mobile interfaces. Getting grammatically and contextually relevant suggestions as to what to type next, while we are typing a few words, would be such a great feature to have.
Coursera, in one of the data science courses by Johns Hopkins, provided four compressed datasets containing unigram, bigram, trigram, and 4-gram terms along with their frequencies. The problem at hand was to come up with a model that can learn to predict relevant words to type next.
The following code uses the Katz back-off algorithm, leveraging the four n-gram term-frequency datasets to predict the next word in a sentence:
# Next-word prediction via Katz back-off over pre-computed n-gram tables.
library(tm)
library(stringr)

# Load the n-gram frequency tables: data_frame1..data_frame4 hold
# 1-gram..4-gram terms (column `terms`) sorted by descending frequency.
load("/data_frame1.RData")
load("/data_frame2.RData")
load("/data_frame3.RData")
load("/data_frame4.RData")

# Normalize a raw input string for lookup against the n-gram tables:
# transliterate to ASCII, lowercase, drop punctuation and numbers, and
# collapse/trim whitespace. Returns "" for input that normalizes to nothing.
CleanInputString <- function(input_string) {
  input_string <- iconv(input_string, "latin1", "ASCII", sub = " ")
  input_string <- gsub("[^[:alpha:][:space:][:punct:]]", "", input_string)

  # Run the same tm cleaning pipeline that was used to build the tables,
  # so lookups are performed in a matching token space.
  input_corpus <- VCorpus(VectorSource(input_string))
  input_corpus <- tm_map(input_corpus, content_transformer(tolower))
  input_corpus <- tm_map(input_corpus, removePunctuation)
  input_corpus <- tm_map(input_corpus, removeNumbers)
  input_corpus <- tm_map(input_corpus, stripWhitespace)

  input_string <- as.character(input_corpus[[1]])
  input_string <- gsub("(^[[:space:]]+|[[:space:]]+$)", "", input_string)

  if (nchar(input_string) > 0) {
    input_string
  } else {
    ""
  }
}

# Predict up to 10 candidate next words for a partial sentence.
#
# Katz back-off: try to match the last 3 words against the 4-gram table,
# then the last 2 against the 3-gram table, then the last 1 against the
# bigram table; if nothing matches, fall back to the single most frequent
# unigram. Returns a one-column data.frame (`word_nxt`) of candidates,
# or a single empty string when the cleaned input is empty.
Get_next_word <- function(input_string) {
  input_string <- CleanInputString(input_string)
  tokens <- unlist(strsplit(input_string, split = " "))
  n_tokens <- length(tokens)

  term_next <- as.character(NULL)

  # Back-off levels: k = number of trailing input words to match, paired
  # with the (k+1)-gram table they are looked up in. Replaces three
  # near-identical copy-pasted branches in the original.
  backoff <- list(
    list(k = 3, table = data_frame4),
    list(k = 2, table = data_frame3),
    list(k = 1, table = data_frame2)
  )

  for (level in backoff) {
    if (n_tokens >= level$k) {
      prefix <- paste(tokens[(n_tokens - level$k + 1):n_tokens],
                      collapse = " ")
      hits <- level$table[grep(paste0("^", prefix), level$table$terms), ]
      # Fix: accept a single match too (original required > 1 rows and
      # silently discarded lone valid matches).
      if (nrow(hits) >= 1) {
        # Fix: head() avoids NA rows that `hits[1:10, 1]` produced
        # whenever fewer than 10 terms matched.
        term_next <- head(hits[, 1], 10)
        break
      }
    }
  }

  # Final fallback: the most frequent unigram overall.
  # Fix: data_frame1$terms is a vector, so the original `[1, 1]`
  # indexing raised "incorrect number of dimensions".
  if (length(term_next) == 0 && n_tokens > 0) {
    term_next <- data_frame1$terms[1]
  }

  if (n_tokens > 0) {
    word_nxt <- word(term_next, -1)  # keep only the predicted final word
  } else {
    word_nxt <- ""
  }
  data.frame(word_nxt)
}
Let's try out the function for an incomplete sentence:
Get_next_word("I AM") word_nxt 1 so 2 a 3 not 4 going 5 in 6 sure 7 the 8 still 9 just 10 now Get_next_word("I AM SO") word_nxt 1 excited 2 happy 3 glad 4 thankful 5 proud 6 blessed 7 tired 8 so 9 jealous 10 very