Demo entry 6663460

social data analysis

   

Submitted by anonymous on Nov 30, 2017 at 04:55
Language: Python 3. Code size: 3.8 kB.

import csv
from datetime import datetime
import matplotlib.pyplot as plt
import collections

# Module-level accumulators shared by the analysis steps below.
e_name = {}               # editor_id -> list of [article_id, edit_time]
yearly_cohort_dict = {}   # cohort year -> {editor_id: list of edits}
cohort_dict = {}          # cohort year -> {'year-month': per-month slot}

# Load every edit from the CSV export and bucket it under its editor.
with open('urwiki.csv', 'r', encoding='utf-8', errors='ignore', newline='') as csv_file:
    for record in csv.DictReader(csv_file):
        editor_id = record['editor_id']
        article_id = record['articleid']
        # The date column is parsed as month/day/year; any other layout
        # makes strptime raise ValueError.
        edited_at = datetime.strptime(record['date_time'], "%m/%d/%Y")

        # Create the editor's list on first sight, then record the edit.
        e_name.setdefault(editor_id, []).append([article_id, edited_at])

# Report the number of distinct editors (e_name has one key per editor)
editor_total = len(e_name)
print('the number of editors is ' + str(editor_total))

# Bucket editors into cohorts keyed by the calendar year of their earliest edit.
# e_name:             { editor_id: [[article_id, edit_time], ...] }
# yearly_cohort_dict: { cohort_year: { editor_id: [[article_id, edit_time], ...] } }
for editor_id, edits in e_name.items():
    first_edit_year = min(entry[1] for entry in edits).year

    # Create the cohort bucket on first sight, then file the editor under it.
    yearly_cohort_dict.setdefault(first_edit_year, {})[editor_id] = edits

# Pre-populate, for every cohort, one slot per calendar month from
# 2004-1 through 2014-12 (keys are unpadded, e.g. '2004-1').
# Each slot starts as [[], 0]: an empty editor list and a zero edit count.
for cohort_year in yearly_cohort_dict:
    cohort_dict[cohort_year] = {
        str(year) + '-' + str(month): [[], 0]
        for year in range(2004, 2015)
        for month in range(1, 13)
    }

# Count the number of edits per year/month for each cohort, and print
# summary statistics for each cohort.
for cohort_year, editor_edit_dict in yearly_cohort_dict.items():
    monthly_slots = cohort_dict[cohort_year]
    # One article id per edit made by this cohort (duplicates kept on purpose:
    # its length is the cohort's total edit count).
    edit_list_per_cohort = []

    # editor_edit_dict: { editor_id: [[article_id, edit_time], ...] }
    for editor, edit_list in editor_edit_dict.items():
        # Months in which this editor has already been added to the slot's
        # editor list. Every editor is a unique dict key and is therefore
        # visited exactly once, so a per-editor set replaces the linear
        # `editor not in <list>` scan with an O(1) check.
        months_seen = set()
        for edit in edit_list:
            article_id = edit[0]
            edit_time = edit[1]
            edit_time_year = str(edit_time.year) + '-' + str(edit_time.month)

            # setdefault: an edit dated outside the pre-initialised month grid
            # gets a fresh [[], 0] slot instead of raising KeyError.
            slot = monthly_slots.setdefault(edit_time_year, [[], 0])
            slot[1] += 1
            if edit_time_year not in months_seen:
                months_seen.add(edit_time_year)
                slot[0].append(editor)

            edit_list_per_cohort.append(article_id)

    ### Calculate the yearly statistics
    editor_list_per_cohort = list(editor_edit_dict)
    unique_article_list_per_cohort = set(edit_list_per_cohort)

    # Output, one value per line: cohort year, number of editors,
    # number of distinct articles edited, total number of edits, blank line.
    print(cohort_year)
    print(len(editor_list_per_cohort))
    print(len(unique_article_list_per_cohort))
    print(len(edit_list_per_cohort))
    print()

# Append the average number of edits per active editor to every monthly slot.
# cohort_dict: { cohort_year: { 'year-month': slot } }
# slot on entry: [0] list of editors who edited that month, [1] edit count
# slot on exit:  [2] added = edit count / number of editors (0 if no editors)
for monthly_slots in cohort_dict.values():
    for slot in monthly_slots.values():
        active_editors = len(slot[0])
        slot.append(slot[1] / active_editors if active_editors else 0)

# Re-key every cohort's months in chronological order. The keys are parsed as
# dates because plain string order would put '2004-10' before '2004-2'.
for year in list(cohort_dict):
    chronological = sorted(
        cohort_dict[year].items(),
        key=lambda item: datetime.strptime(item[0], '%Y-%m'),
    )
    cohort_dict[year] = collections.OrderedDict(chronological)
    
# As a whole: for every year-month key, sum the slot entry at index 2
# across all cohorts.
cohort_dict_total = {}
for year, cohort_year_dict in cohort_dict.items():
    for year_month, count_list in cohort_year_dict.items():
        # Start from 0 on first sight and ALWAYS add the current value.
        # Initialising to 0 in one branch and adding only in the other
        # would silently drop the first cohort's contribution for each month.
        cohort_dict_total[year_month] = (
            cohort_dict_total.get(year_month, 0) + count_list[2]
        )

# Order by time - cohort_dict_total
# (keys are parsed as dates; string order would put '2004-10' before '2004-2')
cohort_dict_total = collections.OrderedDict(
    sorted(
        cohort_dict_total.items(),
        key=lambda t: datetime.strptime(t[0], '%Y-%m'),
    )
)

This snippet took 0.01 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).