
project2


Submitted by Yibo Ma on May 02, 2017 at 06:47
Language: Python. Code size: 5.7 kB.
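
# A toy Wikipedia search engine: crawl a fixed list of pages, compute
# PageRank (with and without a random-jump factor), index visible word
# frequencies into SQLite, and rank lookups by frequency * PageRank.
# Expects 'filelist.txt' (one full https://en.wikipedia.org/wiki/... URL
# per line) and 'stopword.csv' alongside the script.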

import bs4
from bs4 import BeautifulSoup
import urllib
import re
import string
import sqlite3
import networkx as nx
import matplotlib.pyplot as plt

def parseHTML(html):
	# Build the list of crawled pages; line[24:] strips the 24-character
	# 'https://en.wikipedia.org' prefix, leaving the '/wiki/...' path.
	fhand=open('filelist.txt','r')
	lst=[]
	for line in fhand.readlines():
		line=line.rstrip()
		line=line[24:]
		lst.append(line)
	fhand=urllib.urlopen(html)
	inp=fhand.read()
	soup=BeautifulSoup(inp,'lxml')
	dic={}
	tags=soup('a')
	# Count links to other pages in the crawl list, skipping self-links;
	# the first branch matches relative hrefs, the second absolute ones.
	for tag in tags:
		url=tag.get('href',None)
		if url is None:
			continue
		if re.findall(r'^/wiki/[\w_:]+',url)==[url] and url!=html[24:] and url in lst:
			dic[url]=dic.get(url,0)+1
		if re.findall(r'^https://en\.wikipedia\.org/wiki/[\w_:]+',url)==[url] and url!=html and url[24:] in lst:
			url=url[24:]
			dic[url]=dic.get(url,0)+1
	return dic
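# Hypothetical example: parseHTML('https://en.wikipedia.org/wiki/Dog') could
# return {'/wiki/Cat': 2, '/wiki/Wolf': 1} -- how often the page links to
# each other page that also appears in filelist.txt.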


def createDataSet(filelist):
	# Map each page path to its dictionary of outgoing in-list link counts.
	fhand=open(filelist,'r')
	dic_dic={}
	for line in fhand.readlines():
		line=line.rstrip()
		dic=parseHTML(line)
		dic_dic[line[24:]]=dic
	return dic_dic

def createPageScore():
	# Initial PageRank: a uniform 1/N score for each of the N listed pages.
	dic={}
	fhand=open('filelist.txt').readlines()
	for item in fhand:
		item=item.rstrip()
		dic[item[24:]]=1.0/len(fhand)
	return dic


def fstiterateRJF():
	# First PageRank pass (with random-jump factor) from the uniform scores.
	dic_dic=createDataSet('filelist.txt')
	initialscoredic=createPageScore()
	fhand=open('filelist.txt').readlines()
	dic={}
	for item in fhand:
		item=item.rstrip()
		(html,rankscore)=calPageRankRJF(dic_dic,item[24:],initialscoredic)
		dic[html]=rankscore
	return dic_dic,dic

def calPageRankRJF(dic_dic,html,dic):
	# Random-jump term: 15% of the rank is spread uniformly over all pages.
	rankscore=0.15/len(dic_dic)
	for page_name,containedlink in dic_dic.items():
		if html in containedlink:
			rankscore+=0.85*dic[page_name]/len(containedlink)
	return html,rankscore
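# One PageRank update with damping d=0.85 and a uniform random-jump term:
#   PR(p) = (1-d)/N + d * sum(PR(q)/L(q)) over pages q that link to p,
# where N is the number of pages and L(q) is the count of q's distinct
# outgoing in-list links.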

def iteratePageRankRJF(dic_dic,dic):
	newdic={}
	for key in dic.keys():
		html,rankscore=calPageRankRJF(dic_dic,key,dic)
		newdic[html]=rankscore
	return newdic

def convergeRJF(threshold):
	# 'threshold' is a fixed number of passes, not a convergence tolerance.
	count=1
	dic_dic,dic=fstiterateRJF()
	for key,value in dic.items():
		print key,value
	while True:
		if threshold<=count:
			break
		dic=iteratePageRankRJF(dic_dic,dic)
		for key,value in dic.items():
			print key,value
		count+=1
	return dic
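# Usage sketch (hypothetical pass count): scores=convergeRJF(20) runs 20
# passes in total, printing every page's score after each one.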

def calPageRank(dic_dic,html,dic):
	# Plain PageRank update, without the damping/random-jump term.
	rankscore=0.0
	for page_name,containedlink in dic_dic.items():
		if html in containedlink:
			rankscore+=dic[page_name]/len(containedlink)
	return (html,rankscore)

def fstiterate():
	# Same first pass as fstiterateRJF, but using the no-jump calPageRank.
	dic_dic=createDataSet('filelist.txt')
	initialscoredic=createPageScore()
	fhand=open('filelist.txt').readlines()
	dic={}
	for item in fhand:
		item=item.rstrip()
		(html,rankscore)=calPageRank(dic_dic,item[24:],initialscoredic)
		dic[html]=rankscore
	return dic_dic,dic


def iteratePageRank(dic_dic,dic):
	newdic={}
	for key in dic.keys():
		html,rankscore=calPageRank(dic_dic,key,dic)
		newdic[html]=rankscore
	return newdic

def converge(threshold):
	count=1
	dic_dic,dic=fstiterate()
	#for key,value in dic.items():
	#	print key,value
	while True:
		if threshold<=count:
			break
		dic=iteratePageRank(dic_dic,dic)
	#	for key,value in dic.items():
	#		print key,value
		count+=1
	return dic
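# Without the jump term, a page that receives no in-list links drops to a
# score of 0 after one pass, and rank mass held by pages with no outgoing
# in-list links leaks out of the total each pass.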


def visible(element):
	# True only for text nodes that actually render on the page.
	if element.parent.name in ['style','script','[document]','head','title']:
		return False
	elif isinstance(element,bs4.element.Comment):
		return False
	return True

def stopwordlist():
	# Load stop words from a CSV; translate(None,...) strips punctuation
	# (Python 2 str.translate).
	fhand=open('stopword.csv','r').readlines()
	dic={}
	for line in fhand:
		line=line.split(',')
		for word in line:
			word=word.strip()
			word=word.translate(None,string.punctuation)
			if word!='': dic[word]=dic.get(word,0)+1
	return dic

def parseWebContent(html,stopwordlist,count):
	# Count visible, lower-cased, punctuation-stripped words on the page,
	# ignoring stop words, then store the counts in SQLite.
	wordic={}
	url=urllib.urlopen(html).read()
	soup=BeautifulSoup(url,'lxml')
	texts=soup.find_all(text=True)
	visible_texts=filter(visible,texts)
	for strg in visible_texts:
		try:
			strg=str(strg).strip()
			strg=strg.translate(None,string.punctuation)
			strg=strg.lower()
			words=strg.split()
			for word in words:
				if word=='' or word in stopwordlist:
					continue
				wordic[word]=wordic.get(word,0)+1
		except UnicodeEncodeError:
			# str() fails on non-ASCII text under Python 2; skip such nodes.
			continue
	conn=sqlite3.connect('webpage_words.sqlite3')
	cur=conn.cursor()
	for key,value in sorted(wordic.items(),key=lambda item:item[1],reverse=True):
		count+=1
		cur.execute('INSERT INTO Table1 (id,page_name,word,frequency) VALUES(?,?,?,?)',\
			(count,html[24:],key,value))
	conn.commit()
	return count
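# Assumed schema, inferred from the INSERT above (the script never creates
# the table itself):
#   CREATE TABLE Table1 (id INTEGER, page_name TEXT, word TEXT,
#                        frequency INTEGER);
# 'count' threads a running id across calls so each row gets a unique key.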

def lookup():
	term=raw_input('Please enter the word you want to look up:')
	conn=sqlite3.connect('webpage_words.sqlite3')
	cur=conn.cursor()
	# Rank hits by word frequency weighted by PageRank score; the ?
	# placeholder keeps quoting safe.
	cur.execute("SELECT Table1.page_name,(Table1.frequency*webpage_rankscore.rank_score) AS score FROM Table1 JOIN webpage_rankscore ON Table1.page_name=webpage_rankscore.page_name WHERE word=? ORDER BY score DESC LIMIT 10",(term,))
	for row in cur:
		weblink,rankscore=row
		print 'https://en.wikipedia.org'+weblink+', rankscore:'+str(round(rankscore*1000,3))
	cur.close()


def lookupRJF():
	term=raw_input('Please enter the word you want to look up:')
	conn=sqlite3.connect('webpage_words.sqlite3')
	cur=conn.cursor()
	cur.execute("SELECT Table1.page_name,(Table1.frequency*webpage_rankscore.rank_score) AS score FROM Table1 JOIN webpage_rankscore ON Table1.page_name=webpage_rankscore.page_name WHERE word=? ORDER BY score DESC LIMIT 10",(term,))
	for row in cur:
		weblink,rankscore=row
		print 'https://en.wikipedia.org'+weblink+', rankscore:'+str(round(rankscore*1000,3))
	cur.close()
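# lookup() and lookupRJF() run the same query; which ranking they reflect
# depends on what was stored in webpage_rankscore, a table this snippet
# reads but never populates.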


def visualizeDataset():
	# Draw the link graph; nx.Graph() is undirected, so link direction
	# between pages is not shown.
	g=nx.Graph()
	dic_dic=createDataSet('filelist.txt')
	for page_name,containedlink in dic_dic.items():
		for html in containedlink.keys():
			g.add_edge(page_name,html)
	nx.draw(g)
	plt.show()

def chooselookuptype():
	cmd=raw_input('Use page rank with a random jump factor? (y/n): ')
	if cmd=='y':lookupRJF()
	elif cmd=='n':lookup()
	else: print 'Error: wrong input!'

# Demo: one RJF PageRank pass, dumping the link structure and the scores.
dic_dic,dic=fstiterateRJF()
print dic_dic
print dic
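# A fuller pipeline run might look like (hypothetical; assumes the SQLite
# tables already exist, with webpage_rankscore holding the PageRank output):
#   scores=convergeRJF(20)
#   stops=stopwordlist()
#   count=0
#   for line in open('filelist.txt'):
#       count=parseWebContent(line.rstrip(),stops,count)
#   chooselookuptype()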
