Demo entry 2352680

nothing

   

Submitted by anonymous on Aug 07, 2015 at 23:15
Language: Python. Code size: 4.5 kB.

# Module-level author metadata.
__author__ = 'VKrslak'
import os
import gzip
import urllib.parse
import time
from collections import defaultdict
import os


def load_files(path):
    """Collect the names of all page-count dump files found under *path*.

    Walks *path* recursively with ``os.walk`` and keeps every file whose name
    starts with ``"pagecounts-"``. Files sitting directly in the directory
    ``"/"`` are skipped.

    Args:
        path: Directory to search.

    Returns:
        A list of bare file names (no directory component), in the order
        ``os.walk`` reports them. The list is empty when nothing matches.
    """
    files_inner = []
    for dirpath, _dirnames, filenames in os.walk(path):
        # Compare by value. The previous identity test (is not "/") only
        # worked by accident of string interning and emits a SyntaxWarning
        # on Python 3.8+.
        if dirpath != "/":
            files_inner.extend(
                name for name in filenames if name.startswith("pagecounts-")
            )
    return files_inner


def uncompress_and_read(file_path, file_name, search_terms):
    """Scan one gzipped page-count dump for the English pages in *search_terms*.

    Every line is decoded as UTF-8 (undecodable bytes are dropped) and only
    lines starting with ``"en "`` are considered. After percent-decoding,
    the line is split on whitespace: the last two fields are the trailing
    columns (the second to last being the view count) and everything before
    them is the page name, re-joined with underscores because decoding can
    introduce spaces into the name.

    Lines with fewer than two fields, or with a non-numeric view count, are
    skipped instead of aborting the whole scan.

    Args:
        file_path: Directory prefix. It is concatenated directly with
            *file_name*, so it must already end with a path separator.
        file_name: Name of the gzip file inside *file_path*.
        search_terms: Iterable of page names to look for (case sensitive).

    Returns:
        A ``defaultdict(int)`` mapping each matched page name to the largest
        view count seen for it in this file. Names that never matched read
        as 0.
    """
    wanted = frozenset(search_terms)  # O(1) membership test per line
    results = defaultdict(int)

    time1 = time.time()
    # The context manager closes the handle even if reading fails.
    with gzip.open(file_path + file_name, "rb") as reader:
        text = reader.readlines()  # read file into memory
    time2 = time.time()
    print("Time to load file: %.2f seconds" % (time2 - time1))

    for line in text:
        clear_line = line.decode("UTF-8", errors='ignore')
        if not clear_line.startswith("en "):  # English articles only
            continue

        # Decode the URL percent-encoding, drop the "en " prefix, split on
        # whitespace.
        fields = urllib.parse.unquote(clear_line)[3:].split()
        if len(fields) < 2:
            continue  # truncated line: no view-count column to read

        name = '_'.join(fields[:-2])
        if name not in wanted:
            continue

        try:
            views = int(fields[-2])
        except ValueError:
            continue  # malformed view count

        # The same page can appear several times; keep the highest count.
        if results[name] < views:
            results[name] = views

    time3 = time.time()
    print('Time to process file %.2f seconds' %(time3 - time2))
    return results

def check_log(save_folder):
    """Return the file names recorded in a previous, interrupted search.

    Looks for ``log.txt`` among the files directly inside *save_folder*
    (sub-directories are not searched). When it is present, each of its
    lines is returned with surrounding whitespace stripped; otherwise an
    empty list is returned.

    *save_folder* is concatenated directly with ``"log.txt"``, so it must
    already end with a path separator.
    """
    # Only the first os.walk entry (the folder itself) is of interest.
    _, _, top_level_files = next(os.walk(save_folder), (save_folder, [], []))
    if "log.txt" not in top_level_files:
        return []
    with open(save_folder+"log.txt", 'r') as source:
        return [entry.strip() for entry in source.readlines()]

def main():
    """Scan every page-count dump for the configured search terms.

    Reads the terms from ``search_terms.txt``, then processes each dump found
    in the data folder that is not already listed in the save folder's
    ``log.txt``. For every processed dump one row per term is appended to
    that term's CSV file, and the dump's name is appended to the log so an
    interrupted run can be resumed. The save folder must already exist.
    """
    terms_path = "search_terms.txt"
    data_dir = "data\\"            # path to data
    out_dir = "saved_search2\\"    # path to the save folder
    log_path = out_dir+"log.txt"

    def csv_path(term):
        # Colons are removed so the term can be used as a file name.
        return "%s%s.csv" % (out_dir, term.replace(':', ''))

    terms = list(get_terms_from_file(terms_path))  # case sensitive
    already_done = check_log(out_dir)
    print(already_done)

    if not already_done:
        # Fresh run: start every per-term CSV with a header line.
        # NOTE(review): the header names five columns but each data row
        # below writes four values - confirm the intended layout.
        for term in terms:
            with open(csv_path(term), 'w') as destination:
                destination.write("Query,timestamp,Page,visits,language\n")
        # Create (or truncate) the log file without writing anything.
        with open(log_path, 'w'):
            pass

    archives = load_files(data_dir)
    total = len(archives)

    print(terms)

    for position, archive in enumerate(archives, start=1):
        if archive in already_done:
            print("File %s already scanned, skipping..." % archive)
            continue

        # Strip the 11-character "pagecounts-" prefix and the 3-character
        # extension to get the timestamp.
        stamp = archive[11:-3]
        hits = uncompress_and_read(data_dir, archive, terms)
        print(hits)
        for term in terms:
            with open(csv_path(term), 'a') as destination:
                destination.write("%s,%s,%s,EN\n" % (term, stamp, hits[term]))
        print("Finished file %s of %s" % (position, total))
        # Record the dump only after all its rows have been written.
        with open(log_path, 'a') as log_file:
            log_file.write(archive+"\n")

def get_terms_from_file(file_name):
    """Read search terms from *file_name*, one term per line.

    Returns a ``map`` iterator that yields every line of the file with its
    surrounding whitespace stripped. The lines are read in full before the
    file is closed, so the iterator can be consumed afterwards.
    """
    with open(file_name) as source:
        raw_lines = source.readlines()
    return map(str.strip, raw_lines)

# Run the search only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

This snippet took 0.00 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).