Demo entry 6321359

huh

Submitted by anonymous on Nov 10, 2016 at 23:29
Language: Python 3. Code size: 3.7 kB.

import os
import collections
import sys
import csv
import re

class Inv_idx: # builds an inverted index over a tab-separated tweet corpus

    def __init__(self, path):
        self.corpus = path
        self.d_terms = {}
        self.postings = collections.defaultdict(list)

    def get_postings(self):
        with open(self.corpus, 'rt', encoding='utf-8') as csvfile:
            csvreader = csv.reader(csvfile, delimiter='\t')
            for idx,row in enumerate(csvreader):
                if idx > 10: # only index the first 11 rows (demo limit)
                    break
                try:
                    tweet = tokenize_normalize(row[4]) # tokenize each tweet
                except IndexError: # skip rows that have no tweet text column
                    continue
                for token in tweet:
                    term_id = hash(token)
                    self.postings[term_id].append(idx) # append the documentID to the postings list of this termID
                    self.d_terms[token] = (len(self.postings[term_id]), term_id) # map each term to (posting-list length, termID)
 
    def retrieve_originals(self,document_ids):
        with open(self.corpus, 'rt', encoding='utf-8') as csvfile:
            csvreader = csv.reader(csvfile, delimiter='\t')
            hits = [(row[1], row[4]) for idx, row in enumerate(csvreader) if idx in document_ids] # keep the (tweetID, tweet text) pairs of the matching rows
        return hits

    def query(self,*args):
        if len(args) == 1: 
            query = args[0]
            term_id = self.d_terms[query][1]
            document_ids = self.postings[term_id]
            search_results = self.retrieve_originals(document_ids)
        elif len(args) == 2:
            lengthwise_sorted = sorted([(self.d_terms[arg][0], arg) for arg in args], key=lambda x: x[0]) # put the term with the shorter postings list first
            queries = [x[1] for x in lengthwise_sorted]
            term_ids = [self.d_terms[query][1] for query in queries]
            postings = [self.postings[term_id] for term_id in term_ids]
            document_ids = set(postings[0]).intersection(postings[1]) # keep only documents that contain both terms
            search_results = self.retrieve_originals(document_ids)
        return search_results

def index(filename):
    if os.path.exists(filename): # check that the path is valid
        indexed_tweets = Inv_idx(filename) # create an Inv_idx instance for the file found at the path
        indexed_tweets.get_postings() #processes the tweet corpus
    else:
        raise IOError('Path is not correct.')
    return indexed_tweets


def tokenize_normalize(sentence):
    sentence = sentence.split() # split on whitespace
    sentence = [normalize(word) for word in sentence] # normalize each token
    return sentence

def normalize(word):
    """Removes stopwords and some non-alpahnumeric characters that are deemed irrelevant for our purposes."""
    stopwords = ['I','a','about','an','are','as','at','be','by','com','for','from','how','in','is' ,'it' ,'of' ,'on' ,'or' ,'that','the' ,'this','to' ,'was' ,'what' ,'when','where','who' ,'will' ,'with','the','www']
    if word in stopwords: 
        word = ''
    else:
        word = re.sub(r'[\&/\-\(\)\|\,\]\[]+', ' ', word)
        word = re.sub(r'https://t\.co/.*', ' ', word)
        word = re.sub(r'\s\s+', ' ', word)
        word = word.lower()
    return word

if __name__ == "__main__":
    filename = sys.argv[1]
    indexed_tweets = index(filename)
    print(indexed_tweets.query('stuttgart', 'bahn'))
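
For reference, a minimal usage sketch of the class above (the corpus name tweets.tsv is only an assumption: any tab-separated file with the tweet ID in column 2 and the tweet text in column 5 works, and the queried terms must actually occur in the indexed rows):

indexed_tweets = index('tweets.tsv') # hypothetical path; builds postings for the first rows of the corpus
print(indexed_tweets.query('stuttgart')) # single-term lookup
print(indexed_tweets.query('stuttgart', 'bahn')) # two-term intersection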
