Demo entry 3662244

Inverted Lists

   

Submitted by anonymous on Feb 17, 2016 at 13:55
Language: Python. Code size: 5.1 kB.

"""
Copyright 2015, University of Freiburg.

Hannah Bast <bast@cs.uni-freiburg.de>
Björn Buchhold <buchhold@cs.uni-freiburg.de>
"""

import sys
import re


class InvertedIndex:
    """ A simple inverted index.

    >>> ii = InvertedIndex()
    >>> ii.build_from_file("example.txt")
    >>> sorted(ii.inverted_lists.items())
    [('document', [(1, 1), (2, 1), (3, 1)]), ('first', [(1, 1)]), \
('second', [(2, 1)]), ('third', [(3, 1)])]
    """

    def __init__(self):
        """ Create an empty inverted index. """
        self.inverted_lists = dict()
        self.snippets = ['no doc 0']
        self.titles = ['doc0']

    def build_from_file(self, file_name):
        """ Build index for text in given file, one record per line.
        >>> ii = InvertedIndex()
        >>> ii.build_from_file("example2.txt")
        >>> sorted(ii.inverted_lists.items())
        [('one', [(1, 1)]), ('two', [(2, 2)])]
        """
        with open(file_name, "r") as file:
            doc_id = 0
            for line in file:
                doc_id += 1
                self.titles.append(line.strip().split('\t')[0])
                self.snippets.append(" ".join(line.strip().split('\t')[1:]))
                for word in re.split("\W+", line):
                    word = word.lower()
                    if len(word) < 1:
                        continue
                    if word not in self.inverted_lists:
                        self.inverted_lists[word] = [(doc_id, 1)]
                    elif self.inverted_lists[word][
                            len(self.inverted_lists[word]) - 1][0] == doc_id:
                        i = len(self.inverted_lists[word]) - 1
                        self.inverted_lists[word][i] = (
                            doc_id,
                            self.inverted_lists[word][i][1] + 1)
                    else:
                        self.inverted_lists[word].append((doc_id, 1))

    def intersect(self, a, b):
        """ Intersect two posting lists
        >>> ii = InvertedIndex()
        >>> a = [(1, 1), (2, 2)]
        >>> b = [(2, 1)]
        >>> ii.intersect(a, b)
        [(2, 3)]
        """

        res = []
        i = 0
        j = 0
        while i < len(a) and j < len(b):
            if a[i][0] < b[j][0]:
                i += 1
            elif a[i][0] > b[j][0]:
                j += 1
            else:
                res.append((a[i][0], a[i][1] + b[j][1]))
                i += 1
                j += 1
        return res

    def merge(self, a, b):
        """ Merge two posting lists
        >>> ii = InvertedIndex()
        >>> a = [(1, 1), (2, 2)]
        >>> b = [(2, 1)]
        >>> ii.merge(a, b)
        [(1, 1), (2, 3)]
        """

        res = []
        i = 0
        j = 0
        while i < len(a) and j < len(b):
            if a[i][0] < b[j][0]:
                res.append(a[i])
                i += 1
            elif a[i][0] > b[j][0]:
                res.append(b[j])
                j += 1
            else:
                res.append((a[i][0], a[i][1] + b[j][1]))
                i += 1
                j += 1
        while i < len(a):
            res.append(a[i])
            i += 1
        while j < len(b):
            res.append(b[j])
            j += 1
        return res

    def process_query(self, query, union=False):
        """ Process a multi word query
        >>> ii = InvertedIndex()
        >>> ii.build_from_file("example.txt")
        >>> ii.process_query("first document")
        [(1, 2)]

        """
        terms = query.split(' ')
        combine = self.intersect if not union else self.merge
        if len(terms) < 1:
            return []
        else:
            if terms[0] in self.inverted_lists:
                collected = self.inverted_lists[terms[0]]
            else:
                collected = []
            for i in range(1, len(terms)):
                if terms[i] in self.inverted_lists:
                    collected = combine(collected,
                                        self.inverted_lists[terms[i]])
        return collected

    def print_output(self, query, agg_list):
        if not agg_list:
            print("Empty result.")
        regex = '\\b(' + query.replace(' ', '|') + ')\\b'
        start = "\033[1m"
        end = "\033[0;0m"
        highlight = lambda text: re.sub(regex, start + '\\1'
                                        + end, text, flags=re.IGNORECASE)
        for hit in sorted(agg_list, key=lambda e: e[1], reverse=True)[:3]:
            print('\t'.join([str(hit[1]), highlight(self.titles[hit[0]]),
                            highlight(self.snippets[hit[0]])]))

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python3 inverted_index.py <file>")
        sys.exit()
    file_name = sys.argv[1]
    ii = InvertedIndex()
    ii.build_from_file(file_name)
    while (True):
        print()
        print()
        q = input("Enter query: ")
        merge = (input("Use merge instead of intersect? [y/n]: ") == "y")
        print()
        ii.print_output(q, ii.process_query(q, merge))

This snippet took 0.01 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).