
IR - PAGERANK


# PageRank with taxation (teleportation) on a small 3-page web graph
import numpy as np
from scipy.sparse import csc_matrix
from fractions import Fraction

# keep it clean and tidy
def float_format(vector, decimal):
    return np.round(vector.astype(float), decimals=decimal)

G = np.matrix([[1, 1, 0],
               [1, 0, 1],
               [0, 1, 0]])
n = len(G)

# transform G into the Markov transition matrix M (rows normalized by out-degree)
M = csc_matrix(G, dtype=float)
rsums = np.array(M.sum(1))[:, 0]
ri, ci = M.nonzero()
M.data /= rsums[ri]

# we have 3 webpages, so the default probability of landing on each one is 1/3
dp = Fraction(1, n)
E = np.zeros((n, n))
E[:] = dp

# taxation (damping) factor
beta = 0.85

# WWW matrix
A = beta * M + ((1 - beta) * E)

# initial rank vector
r = np.matrix([dp, dp, dp])
r = np.transpose(r)

previous_r = r
for it in range(1, 30):
    # iterate r <- A^T r: the rank vector is the stationary distribution
    # (a left eigenvector of A), so the transpose is used here
    r = A.T * r
    # check if converged
    if (previous_r == r).all():
        break
    previous_r = r

print("Final:\n", float_format(r, 3))
print("sum", np.sum(r))
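
As a sanity check, networkx ships a PageRank implementation; a hedged cross-check sketch, assuming networkx is installed (with the transpose iteration above, the two sets of scores should approximately agree):

# Illustrative cross-check, not part of the original listing
import numpy as np
import networkx as nx

adj = np.array([[1, 1, 0],
                [1, 0, 1],
                [0, 1, 0]])
graph = nx.from_numpy_array(adj, create_using=nx.DiGraph)
# alpha plays the same role as beta (the taxation/damping factor) above
print(nx.pagerank(graph, alpha=0.85))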

IR - BITWISE


# Boolean retrieval with term-document incidence vectors packed into integers
plays = {"Anthony and Cleopatra": "Anthony is there, Brutus is Caeser is with Cleopatra mercy worser.",
         "Julius Ceaser": "Anthony is there, Brutus is Caeser is but Calpurnia is.",
         "The Tempest": "mercy worser",
         "Hamlet": "Caeser and Brutus are present with mercy and worser",
         "Othello": "Caeser is present with mercy and worser",
         "Macbeth": "Anthony is there, Caeser, mercy."}
words = ["Anthony", "Brutus", "Caeser", "Calpurnia", "Cleopatra", "mercy", "worser"]

# term-document incidence matrix: one row per term, one column per play
vector_matrix = [[0 for i in range(len(plays))] for j in range(len(words))]
text_list = list(plays.values())
for i in range(len(words)):
    for j in range(len(text_list)):
        if words[i] in text_list[j]:
            vector_matrix[i][j] = 1
        else:
            vector_matrix[i][j] = 0

for row in vector_matrix:
    print(row)

# pack each term's row into a single integer so queries become bitwise operations
string_list = []
for vector in vector_matrix:
    mystring = ""
    for digit in vector:
        mystring += str(digit)
    string_list.append(int(mystring, 2))

# "Anthony AND Brutus AND Caeser": bitwise AND of the three term vectors
print("The output is ", bin(string_list[0] & string_list[1] & string_list[2]).replace("0b", ""))
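
NOT queries also work on the packed integers once the complement is masked to the number of documents; a small illustrative extension reusing string_list and plays from above (the query "Brutus AND Caeser AND NOT Calpurnia" is just an example):

# Illustrative only: "Brutus AND Caeser AND NOT Calpurnia"
mask = (1 << len(plays)) - 1  # keep only as many bits as there are documents
answer = string_list[1] & string_list[2] & (~string_list[3] & mask)
print(bin(answer).replace("0b", "").zfill(len(plays)))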

IR - COMPARE FILES


# Cosine similarity between two text files using stemmed, stopword-filtered term counts
import numpy as np
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import defaultdict

def process(file):
    raw = open(file).read()
    tokens = word_tokenize(raw)
    words = [w.lower() for w in tokens]
    porter = nltk.PorterStemmer()
    stemmed_tokens = [porter.stem(t) for t in words]
    # removing stop words
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [w for w in stemmed_tokens if w not in stop_words]
    # count words
    count = defaultdict(int)
    for word in filtered_tokens:
        count[word] += 1
    return count

def cos_sim(a, b):
    dot_product = np.dot(a, b)
    norm_a = np.linalg.norm(a)
    norm_b = np.linalg.norm(b)
    return dot_product / (norm_a * norm_b)

def getSimilarity(dict1, dict2):
    # collect terms from both documents, then turn each document into a count vector over that list
    all_words_list = []
    for key in dict1:
        all_words_list.append(key)
    for key in dict2:
        all_words_list.append(key)
    all_words_list_size = len(all_words_list)
    v1 = np.zeros(all_words_list_size, dtype=int)
    v2 = np.zeros(all_words_list_size, dtype=int)
    i = 0
    for key in all_words_list:
        v1[i] = dict1.get(key, 0)
        v2[i] = dict2.get(key, 0)
        i = i + 1
    return cos_sim(v1, v2)

if __name__ == '__main__':
    dict1 = process("text1.txt")
    dict2 = process("text2.txt")
    print("Similarity between two text documents", getSimilarity(dict1, dict2))
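
The tokenizer and the stop word list are NLTK data packages that have to be downloaded once before process() will run; a one-time setup snippet:

# One-time NLTK data download (run once per environment)
import nltk
nltk.download('punkt')
nltk.download('stopwords')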

IR - TWITTER MINING


# Twitter sentiment mining with tweepy + TextBlob, plotted with matplotlib
import tweepy
from tkinter import *
from datetime import datetime
from textblob import TextBlob
import matplotlib.pyplot as plt

consumer_key = 'your consumer key'
consumer_secret = 'your consumer secret'
access_token = 'your access token'
access_token_secret = 'your access token secret'

auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_token_secret)
api = tweepy.API(auth)

user = api.me()
print(user.name)

# GUI
root = Tk()
label1 = Label(root, text="Search")
E1 = Entry(root, bd=5)
label2 = Label(root, text="Sample Size")
E2 = Entry(root, bd=5)

def getE1():
    return E1.get()

def getE2():
    return E2.get()

def getData():
    keyword = getE1()
    numberOfTweets = int(getE2())

    # where the tweet sentiments are stored to be plotted
    polarity_list = []
    numbers_list = []
    number = 1

    for tweet in tweepy.Cursor(api.search, keyword, lang="en").items(numberOfTweets):
        try:
            analysis = TextBlob(tweet.text).sentiment
            polarity_list.append(analysis.polarity)
            numbers_list.append(number)
            number = number + 1
        except tweepy.TweepError as e:
            print(e.reason)
        except StopIteration:
            break

    # plotting
    axes = plt.gca()
    axes.set_ylim([-1, 2])
    plt.scatter(numbers_list, polarity_list)

    averagePolarity = sum(polarity_list) / len(polarity_list)
    averagePolarity = "{0:.0f}%".format(averagePolarity * 100)
    time = datetime.now().strftime("At: %H:%M\nOn: %m-%d-%y")

    plt.text(0, 1.25,
             "Average Sentiment: " + str(averagePolarity) + "\n" + time,
             fontsize=12,
             bbox=dict(facecolor='none', edgecolor='black', boxstyle='square, pad = 1'))
    plt.title("Sentiment of " + keyword + " on Twitter")
    plt.xlabel("Number of Tweets")
    plt.ylabel("Sentiment")
    plt.show()

submit = Button(root, text="Submit", command=getData)

label1.pack()
E1.pack()
label2.pack()
E2.pack()
submit.pack(side=BOTTOM)
root.mainloop()
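
Note: the listing above targets the older tweepy 3.x interface. In tweepy 4.x, api.search was renamed to api.search_tweets, api.me() was removed, and tweepy.TweepError was replaced by the exception classes in tweepy.errors, so those calls would need adjusting; current Twitter/X access tiers may also restrict the search endpoint.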

IR - EDIT DISTANCE


# A naive recursive Python program to find the minimum number of
# operations needed to convert str1 to str2
def editDistance(str1, str2, m, n):
    # If the first string is empty, the only option is to
    # insert all characters of the second string into the first
    if m == 0:
        return n

    # If the second string is empty, the only option is to
    # remove all characters of the first string
    if n == 0:
        return m

    # If the last characters of the two strings are the same, nothing
    # much to do. Ignore the last characters and get the count for
    # the remaining strings.
    if str1[m-1] == str2[n-1]:
        return editDistance(str1, str2, m-1, n-1)

    # If the last characters differ, consider all three operations on
    # the last character of the first string, recursively compute the
    # minimum cost for all three, and take the minimum.
    return 1 + min(editDistance(str1, str2, m, n-1),    # Insert
                   editDistance(str1, str2, m-1, n),    # Remove
                   editDistance(str1, str2, m-1, n-1))  # Replace

# Driver program to test the above function
str1 = "sun"
str2 = "sat"
print("The total number of operations needed is : ",
      editDistance(str1, str2, len(str1), len(str2)))
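
The naive recursion above re-solves the same subproblems, so its running time grows exponentially with the string lengths. A hedged alternative sketch (not part of the original listing) that memoizes the same recurrence with functools.lru_cache:

from functools import lru_cache

def edit_distance_memo(str1, str2):
    # Same recurrence as editDistance, but each (m, n) subproblem is solved only once
    @lru_cache(maxsize=None)
    def dist(m, n):
        if m == 0:
            return n
        if n == 0:
            return m
        if str1[m-1] == str2[n-1]:
            return dist(m-1, n-1)
        return 1 + min(dist(m, n-1),    # Insert
                       dist(m-1, n),    # Remove
                       dist(m-1, n-1))  # Replace
    return dist(len(str1), len(str2))

print(edit_distance_memo("sun", "sat"))  # 2, same result as the naive version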

IR - STOPWORDS


from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

example_sent = "This is a sample sentence, showing off the stop words filtration"

stop_words = set(stopwords.words('english'))
word_tokens = word_tokenize(example_sent)

# one-line version
filtered_sentence = [w for w in word_tokens if w not in stop_words]

# equivalent loop version
filtered_sentence = []
for w in word_tokens:
    if w not in stop_words:
        filtered_sentence.append(w)

print(word_tokens)
print(filtered_sentence)
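
Because the NLTK stop word list is all lowercase, capitalized tokens such as "This" pass through the filter above. If that is not wanted, lowercasing before tokenizing removes them too; a small optional tweak reusing example_sent and stop_words:

# Optional: lowercase first so capitalized stop words are filtered as well
word_tokens = word_tokenize(example_sent.lower())
filtered_sentence = [w for w in word_tokens if w not in stop_words]
print(filtered_sentence)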

IR - SIMPLE WEB CRAWLER


# A simple one-level web crawler: scan a start page and the pages it links to for a word
from html.parser import HTMLParser
from urllib.request import urlopen
from urllib import parse
import json

class LinkParser(HTMLParser):
    def handle_starttag(self, tag, attrs):
        if tag == "a":
            for (key, value) in attrs:
                if key == "href":
                    newUrl = parse.urljoin(self.baseUrl, value)
                    self.links = self.links + [newUrl]

    def getLinks(self, url):
        self.links = []
        self.baseUrl = url
        response = urlopen(url)
        if "text/html" in response.getheader("Content-Type"):
            htmlContent = response.read()
            htmlString = htmlContent.decode("utf-8")
            self.feed(htmlString)
            response.close()
            return htmlString, self.links
        else:
            return "", []

def crawl(url, word):
    # List of URLs where the word was found
    foundUrl = []
    # List of already visited URLs, to prevent visiting the same URL twice
    visitedURL = []
    # Count of all the pages visited
    numberVisited = 0
    # Flag set when the word is found at least once
    foundWord = False
    # Start the parser
    parser = LinkParser()
    # Check the first URL and collect its links
    data, links = parser.getLinks(url)
    links.append(url)
    # Loop over all the links found on the start page
    for link in links:
        numberVisited = numberVisited + 1
        try:
            # Only scan links that have not been visited yet
            if link not in visitedURL:
                visitedURL.append(link)
                data, li = parser.getLinks(link)
                print(numberVisited, "Scanning URL ", link)
                if data.find(word) > -1:
                    foundWord = True
                    foundUrl.append(link)
                    print("-" * 10)
                    print(" ")
                    print("The word", word, "was found at", link)
                    print(" ")
                    print("-" * 10)
                else:
                    print("Matches Not Found")
        except Exception:
            print(" **Failed **", "")

    # If the word was never found, report it
    if foundWord == False:
        print("The word", word, "was not found!")
    print("Finished, crawled", numberVisited, "pages")
    print(json_list(foundUrl))

def json_list(list):
    # Serialize the list of matching URLs as a compact JSON array
    lst = []
    for pn in list:
        lst.append(pn)
    return json.dumps(lst, separators=(',', ':'))

crawl("https://www.facebook.com", "login")
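
As written, crawl() goes only one level deep: it scans the start page and the pages that page links to, but never follows links discovered on those second-level pages, and it makes no attempt at politeness (delays, robots.txt). A deeper crawler would keep a frontier queue of unvisited links instead of a single for loop.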

IR - XML PARSE TEXT


import csv
import requests
import xml.etree.ElementTree as ET

def loadRSS():
    # url of rss feed
    url = 'http://www.hindustantimes.com/rss/topnews/rssfeed.xml'
    # creating HTTP response object from given url
    resp = requests.get(url)
    # saving the xml file
    with open('topnewsfeed.xml', 'wb') as f:
        f.write(resp.content)

def parseXML(xmlfile):
    # create element tree object
    tree = ET.parse(xmlfile)
    # get root element
    root = tree.getroot()
    # create empty list for news items
    newsitems = []
    # iterate news items
    for item in root.findall('./channel/item'):
        # empty news dictionary
        news = {}
        # iterate child elements of item
        for child in item:
            # special check for the media:content element, which lives in the MRSS namespace
            if child.tag == '{http://search.yahoo.com/mrss/}content':
                news['media'] = child.attrib['url']
            else:
                news[child.tag] = child.text.encode('utf8')
        # append news dictionary to news items list
        newsitems.append(news)
    # return news items list
    return newsitems

def savetoCSV(newsitems, filename):
    # specifying the fields for csv file
    fields = ['guid', 'title', 'pubDate', 'description', 'link', 'media']
    # writing to csv file
    with open(filename, 'w') as csvfile:
        # creating a csv dict writer object
        writer = csv.DictWriter(csvfile, fieldnames=fields)
        # writing headers (field names)
        writer.writeheader()
        # writing data rows
        writer.writerows(newsitems)

def main():
    # load rss from web to update existing xml file
    loadRSS()
    # parse xml file
    newsitems = parseXML('topnewsfeed.xml')
    # store news items in a csv file
    savetoCSV(newsitems, 'topnews.csv')

if __name__ == "__main__":
    # calling main function
    main()
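
A quick way to inspect the generated file, assuming pandas is available (purely illustrative, not part of the original script):

# Illustrative check of the CSV written by savetoCSV()
import pandas as pd
print(pd.read_csv('topnews.csv').head())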

IR - OCCURRENCE


def word_count(text):
    counts = dict()
    words = text.split()
    for word in words:
        if word in counts:
            counts[word] += 1
        else:
            counts[word] = 1
    return counts

print(word_count('the quick brown fox jumps over the lazy dog.'))
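
The same tally can be produced with the standard library's collections.Counter; a minimal equivalent (it prints a Counter object rather than a plain dict):

# Equivalent one-liner using the standard library
from collections import Counter
print(Counter('the quick brown fox jumps over the lazy dog.'.split()))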

IR - INCIDENCE


import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

docs = ['why hello there', 'omg hello pony', 'she went there? omg']
vec = CountVectorizer()
X = vec.fit_transform(docs)

# get_feature_names() was removed in newer scikit-learn; get_feature_names_out() replaces it
df = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out())
print(df)
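
The DataFrame above holds raw term counts (one row per document, one column per vocabulary term). A binary term-document incidence matrix like the one built in the BITWISE section can be derived from it; a small sketch reusing df from above:

# Illustrative: turn counts into a 0/1 incidence matrix with terms as rows and documents as columns
incidence = (df > 0).astype(int).T
print(incidence)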

python -m pip install --upgrade pip
