Demo entry 6786682

preprocess

   

Submitted by anonymous on Apr 03, 2019 at 04:19
Language: Python. Code size: 811 Bytes.

def preprocess_input_json(jfile, preprocessed_dir, category_list=None):
    """Tokenize the abstracts of one JSON file and write them as JSON lines.

    The input is read with ``orient='index'``. The output contains a
    ``token_abs`` column (``tokenize`` applied to every ``abstract`` value)
    next to whatever ``get_ohced_main_category(raw_df)`` returns, joined
    column-wise. It is written into ``preprocessed_dir`` under the same base
    name as ``jfile``, one record per line.

    Args:
        jfile: Path of the input JSON file.
        preprocessed_dir: Directory that receives the output file.
        category_list: Optional; accepted so the function can be invoked
            with the extra positional argument that ``dask_preprocess``
            passes through ``map``. It is not used in this function.
            NOTE(review): confirm whether ``get_ohced_main_category``
            is supposed to receive it.

    Raises:
        KeyError: If the input has no ``abstract`` column.
    """
    with open(jfile, "r") as f:
        raw_df = pd.read_json(f, orient='index')

    # Create the output frame explicitly. Assigning a column to a name that
    # has not been bound yet raises UnboundLocalError.
    processed_df = raw_df['abstract'].map(tokenize).to_frame('token_abs')
    ohced_main_cat = get_ohced_main_category(raw_df)
    processed_df = pd.concat([processed_df, ohced_main_cat], axis=1)

    out_file = os.path.join(preprocessed_dir, os.path.basename(jfile))
    # to_json opens and closes the target path itself, so no separate
    # open() handle on the same file is needed.
    processed_df.to_json(out_file, orient='records', lines=True)



def dask_preprocess(input_dir, preprocessed_dir, glob_pattern, category_list):
    """Run ``preprocess_input_json`` over every matching file in a directory.

    Files are selected by globbing ``glob_pattern`` inside ``input_dir``.
    Each matched path is handed to ``preprocess_input_json`` together with
    ``preprocessed_dir`` and ``category_list`` as extra positional arguments.
    The result of ``compute()`` is discarded; nothing is returned.

    Args:
        input_dir: Directory that is searched for input files.
        preprocessed_dir: Forwarded to ``preprocess_input_json``.
        glob_pattern: Glob pattern, joined onto ``input_dir``.
        category_list: Forwarded to ``preprocess_input_json``.
    """
    search_pattern = os.path.join(input_dir, glob_pattern)
    matched_paths = glob.glob(search_pattern)

    # NOTE(review): ``db`` is presumably ``dask.bag`` -- confirm against the
    # file's imports.
    path_bag = db.from_sequence(matched_paths)
    jobs = path_bag.map(preprocess_input_json, preprocessed_dir, category_list)

    # Building the bag only describes the work; compute() triggers it.
    jobs.compute()

This snippet took 0.00 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).