Demo entry 6784941

Sample

   

Submitted by anonymous on Mar 11, 2019 at 23:13
Language: Python 3. Code size: 1.1 kB.

# Extract the topmost date found in each OCR result file and save one row per
# file to a CSV.
#
# Names expected to be defined elsewhere: `ocr_dir`, `image_dir`,
# `find_dates`, `tqdm_notebook`, plus the `os` and `pd` (pandas) imports.
ocr_files = sorted(os.listdir(ocr_dir))

# One row per OCR file, indexed by file name. Rows stay NaN for files that are
# empty after filtering or in which no date is found.
df_extracted_dates = pd.DataFrame(
    columns=['date_string', 'year', 'month', 'day'], index=ocr_files)

for file in tqdm_notebook(ocr_files):
    tsv_file = os.path.join(ocr_dir, file)
    df = pd.read_csv(tsv_file, sep=r'\t', dtype={'text': str}, engine='python')
    df = df[df.conf > -1]  # remove empty words
    # Rebind instead of `inplace=True`: `df` is a filtered slice here, and an
    # in-place drop on a slice can raise SettingWithCopyWarning.
    df = df.dropna(subset=['text'])
    if df.empty:
        continue

    dates_found = find_dates(df)
    if dates_found.empty:
        continue

    # Pick the match with the smallest 'top' value.
    idx_topmost_date = dates_found['top'].idxmin()

    # Index the result row by `file` (the loop variable). The original used an
    # undefined name `f`, which raised NameError or hit a stale variable.
    df_extracted_dates.loc[file, 'month'] = dates_found.loc[idx_topmost_date, 'month']
    df_extracted_dates.loc[file, 'day'] = dates_found.loc[idx_topmost_date, 'day']
    # NOTE(review): 'year' is filled from the 'text' column of the match --
    # presumably the matched token is the year; confirm against find_dates.
    df_extracted_dates.loc[file, 'year'] = dates_found.loc[idx_topmost_date, 'text']
    # NOTE(review): `day_string` is not read anywhere in this snippet; kept in
    # case later code relies on it.
    day_string = dates_found.loc[idx_topmost_date, 'day']
    # Join the 'text' of the rows labelled idx-2 .. idx (label-based, inclusive
    # slice) into a single space-separated string.
    date_string = df.loc[idx_topmost_date-2:idx_topmost_date, 'text'].str.cat(sep=' ')
    df_extracted_dates.loc[file, 'date_string'] = date_string

output_file = image_dir + '_results.csv'
df_extracted_dates.to_csv(output_file)

This snippet took 0.00 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).