Submitted by anonymous on Jun 24, 2017 at 04:32
Language: Python.

def corpus_to_sentences(corpus):
    """Turn a raw corpus into a list of cleaned, lower-cased sentences.

    Steps:
      1. Replace bracketed single-digit markers such as ``[3]`` (author
         notes) with a space. Markers with two or more digits are not
         matched by the pattern.
      2. Lower-case and strip the text, then split it into sentences with
         NLTK's French Punkt tokenizer.
      3. Replace every run of non-word characters and underscores in each
         sentence with a single space.

    Args:
        corpus: The text, either UTF-8 encoded bytes or already-decoded
            unicode text.

    Returns:
        A list of unicode sentence strings.
    """
    # Imported locally so the function does not depend on how (or whether)
    # the module imports nltk at top level.
    import nltk.data

    tokenizer = nltk.data.load('tokenizers/punkt/french.pickle')

    # Decode only when given bytes, so already-decoded text is accepted too.
    if isinstance(corpus, bytes):
        corpus = corpus.decode('utf-8')

    # 1. Remove author notes. The flag must be passed by keyword: the fourth
    # positional argument of re.sub is ``count``, so a positional re.U would
    # cap the number of substitutions instead of enabling Unicode matching.
    text = re.sub(r'\[[0-9]\]', u' ', corpus, flags=re.UNICODE)

    # 2. Split into lower-cased sentences.
    sentences = tokenizer.tokenize(text.lower().strip())

    # 3. Remove non-letter symbols (punctuation, underscores, ...).
    return [re.sub(r'[\W_]+', u' ', sentence, flags=re.UNICODE)
            for sentence in sentences]

def sentences_to_list(sents):
    """Split each sentence into a list of words, dropping French stop words.

    Args:
        sents: Iterable of sentence strings whose words are separated by
            whitespace.

    Returns:
        A list containing one list of words per input sentence, in the same
        order. A sentence made only of stop words yields an empty list.
    """
    # Build the set once so each membership test is O(1).
    stopwords_fr = set(stopwords.words('french'))
    # The stop-word test is applied per word: comparing whole sentence
    # strings against the stop-word list would filter almost nothing.
    return [[word for word in sent.split() if word not in stopwords_fr]
            for sent in sents]

def text_to_model(text):
    """Build a model from a corpus file, sizing the vectors by file size.

    The file is read from ``path + text`` (``path`` is a module-level name
    defined outside this function). Its content is split into sentences and
    word lists, then handed to ``to_w2v`` together with the chosen dimension.

    Dimension by file size in MB (10**6 bytes, rounded to one decimal):
      * below 0.5 MB -> 48
      * below 2 MB   -> 72
      * otherwise    -> 100

    Args:
        text: File name of the corpus, appended to ``path``.

    Returns:
        Whatever ``to_w2v`` returns (presumably a word2vec model; confirm
        against its definition).
    """
    filename = path + text

    # Float division: integer division would truncate the size to whole
    # megabytes and make the 0.5 MB threshold meaningless.
    size_mb = round(os.path.getsize(filename) / 1000000.0, 1)

    if size_mb < 0.5:
        dim = 48
    elif size_mb < 2:
        dim = 72
    else:
        # Explicit fallback so ``dim`` is always bound and the 72 branch is
        # not overwritten.
        dim = 100

    # Read raw bytes; corpus_to_sentences performs the UTF-8 decoding.
    with open(filename, 'rb') as corpus_file:
        raw_text = corpus_file.read()

    sentences = sentences_to_list(corpus_to_sentences(raw_text))
    return to_w2v(sentences, dim)

