Demo entry 6648093

python

   

Submitted by anonymous on Oct 23, 2017 at 06:23
Language: Python 3. Code size: 8.8 kB.

## -*- coding: utf-8 -*-


"""
This program aims to find the orthographic variants in French.
After running it, a file named "variants.txt" will be created in the same folder
as this Python programme file,
in which you will find data about orthographic variants in French specifically.

It is for HG2051. Lecturer: Assoc.Prof.Francis Bond
@author: Liu Junling

"""

## initialization:
## import required modules 

import nltk

from nltk.corpus import wordnet as wn
from nltk.metrics import edit_distance

## settings for the search
# language code handed to lemma_names(lang=...) when reading each synset
lng = 'fra'
# exclusive upper bound: a pair is kept when edit_distance(...) < maxdist
maxdist = 2

## materialise every WordNet synset once, so the main loop can walk a list
all_synsets = [synset for synset in wn.all_synsets()]


## translation tables for str.translate(): each one maps a single
## special character to its plain counterpart (code point -> code point).
## They are used instead of str.replace for the characters below.
transi = {ord("î"): ord("i")}
transu = {ord("û"): ord("u")}
transquo = {ord("’"): ord("'")}
transa = {ord("â"): ord("a")}


## create lists of the found patterns
## label them by their general category
allcat = ['accent', 'simpli', 'pron', 'hyphen', 'space',
          'case', 'prefix', 'suffix', 'punc', 'diff']

## one independent, initially empty result list per general category.
## The global names must stay in step with the entries of allcat, because
## the rest of the script refers to the lists by these names.
## (Bound explicitly rather than by exec()-ing generated source text.)
(accent, simpli, pron, hyphen, space,
 case, prefix, suffix, punc, diff) = ([] for _ in allcat)

    
## define a function for checking 's_' in multi-words
def finds(l1, l2):
    """Tell whether two multiword lemmas differ only by a plural first word.

    Words inside a lemma are separated by '_'.  The pair matches when both
    lemmas are multiword, everything after the first word is identical, and
    one first word is the other one followed by the suffix 's'.

    Parameters:
        l1, l2: the two lemma strings to compare.

    Returns:
        True when the pair matches the pattern above, otherwise False
        (always a real bool, never None).
    """
    head1, sep1, rest1 = l1.partition('_')
    head2, sep2, rest2 = l2.partition('_')
    # both lemmas must contain a separator, i.e. be multiword
    if not (sep1 and sep2):
        return False
    # the difference has to be confined to the first word
    if rest1 != rest2:
        return False
    # string comparison also copes with an empty first word (no indexing)
    return head1 == head2 + 's' or head2 == head1 + 's'



## start checking synsets
## For every synset, compare all lemmas of the target language with each
## other, keep the pairs whose edit distance is below maxdist, and file each
## kept pair under exactly one general category list.
## The elif chain below is order-sensitive: the first test that matches
## decides the category and none of the later tests is tried.
allpairs = []
for ss in all_synsets: 
    lemmas= ss.lemma_names(lang=lng)
    for l1 in lemmas:
        for l2 in lemmas:
            # check if they are similar
            # (l1 > l2 is a string comparison: it skips identical lemmas
            #  and takes only one of the two orderings of a pair)
            if l1 > l2 and edit_distance(l1,l2) < maxdist:
                # get a list of all the pairs, so as to count the total number
                # of pairs later
                allpairs.append((l1,l2))
                # try to categorize
                # cat only refers to the subcategory of the pairs.
                # words are first given a subcategory,
                # and then added to a list of the general category
                # as a tuple (synset, l1, l2, subcategory label)

                # --- suffix: l1 is l2 plus one final letter ---
                if l1 == l2 + 'e':
                    cat = 'fem/masc: -e/'
                    suffix.append((ss,l1,l2,cat))
                elif l1[-1] == 'e' and l2[-1] == 'a':
                    cat = 'fem/masc: -e/-a'
                    suffix.append((ss,l1,l2,cat))
                elif l1 == l2 + 's':
                    cat = 'pl/sg: -s/'
                    suffix.append((ss,l1,l2,cat))
                elif l1 == l2 + 'r':
                    cat = '1/3: -r/'
                    suffix.append((ss,l1,l2,cat))
                # --- case: first letter only, then anywhere in the lemma ---
                elif l1 == l2[0].lower() + l2[1:]:
                    cat = 'lower/upper'
                    case.append((ss,l1,l2,cat))
                elif l1.lower()==l2.lower():
                    cat = '_lower/_upper'
                    case.append((ss,l1,l2,cat))
                # --- hyphen / space: '-' removed, '_' vs '-', '_' removed ---
                elif l1 == l2.replace('-',''):
                    cat = '/-'
                    hyphen.append((ss,l1,l2,cat))
                elif l1.replace('_','-') == l2.replace('_','-'):
                    cat = '_/-'
                    hyphen.append((ss,l1,l2,cat))
                elif l1 == l2.replace('_',''):
                    cat = '/_'
                    space.append((ss,l1,l2,cat))
                # --- accent: grave/acute, acute/none, circumflex/none ---
                elif l1 == l2.replace('è','é'):
                    cat = 'é/è'
                    accent.append((ss,l1,l2,cat))
                elif l1.replace('é','e') == l2:
                    cat = 'é/e'
                    accent.append((ss,l1,l2,cat))
                elif l1.translate(transu) == l2:
                    cat = 'circumflex: û/u'
                    accent.append((ss,l1,l2,cat))
                elif l1.translate(transi) == l2:
                    cat = 'circumflex: î/i'
                    accent.append((ss,l1,l2,cat))
                elif l1.translate(transa) == l2:
                    cat = "circumflex: â/a"
                    accent.append((ss,l1,l2,cat))
                # --- punctuation: typographic vs straight apostrophe ---
                elif l1.translate(transquo) == l2:
                    cat = "’/'"
                    punc.append((ss,l1,l2,cat))
                elif l1 == l2 + 't':
                    cat = 'masc/fem: -t/'
                    suffix.append((ss,l1,l2,cat))
                # --- the two lemmas are equal once every 'h' is dropped ---
                elif l1.replace('h','') == l2.replace('h',''):
                    cat = 'h'
                    pron.append((ss,l1,l2,cat))
                # --- prefix: one lemma is the other plus a leading letter ---
                elif l1 == 'r' + l2 or 'r' + l1 == l2:
                    cat = 'r-'
                    prefix.append((ss,l1,l2,cat))
                elif l1 == 'é' + l2 or 'é' + l1 == l2:
                    cat = 'é-'
                    prefix.append((ss,l1,l2,cat))
                elif l1 == 'a' + l2 or 'a' + l1 == l2:
                    cat = 'a-'
                    prefix.append((ss,l1,l2,cat))
                elif l1 == l2.replace('y','i') or l1.replace('y','i') == l2:
                    cat = 'y/i'
                    pron.append((ss,l1,l2,cat))
                # --- simpli: doubled consonant vs single consonant ---
                elif l1 == l2.replace('ll','l') or l1.replace('ll','l') == l2:
                    cat = 'll/l'
                    simpli.append((ss,l1,l2,cat))
                elif l1 == l2.replace('tt','t') or l1.replace('tt','t') == l2:
                    cat = 'tt/t'
                    simpli.append((ss,l1,l2,cat))
                elif l1 == l2.replace('nn','n') or l1.replace('nn','n') == l2:
                    cat = 'nn/n'
                    simpli.append((ss,l1,l2,cat))
                elif l1 == l2.replace('pp','p') or l1.replace('pp','p') == l2:
                    cat = 'pp/p'
                    simpli.append((ss,l1,l2,cat))
                elif l1 == l2.replace('bb','b') or l1.replace('bb','b') == l2:
                    cat = 'bb/b'
                    simpli.append((ss,l1,l2,cat))
                elif l1 == l2.replace('rr','r') or l1.replace('rr','r') == l2:
                    cat = 'rr/r'
                    simpli.append((ss,l1,l2,cat))
                elif l1.replace('ier', 'er') == l2:
                    cat = '-ier/-er'
                    suffix.append((ss,l1,l2,cat))
                # --- k / c / ck / q spellings ---
                elif l1 == l2.replace('k','c') or l1.replace('k','c') == l2 \
                     or l1 == l2 + 'k':
                    cat = 'k/c'
                    pron.append((ss,l1,l2,cat))
                elif l1 == l2.replace('ck', 'k'):
                    cat = 'k/ck'
                    pron.append((ss,l1,l2,cat))
                elif l1.replace('q', 'k') == l2:
                    cat = 'q/k'
                    pron.append((ss,l1,l2,cat))
                elif l1 == l2 + 'x':
                    cat = 'pl/sg: x/'
                    suffix.append((ss,l1,l2,cat))
                elif l1 == l2.replace('es','s'):
                    cat = 'masc/fem: /-e'
                    suffix.append((ss,l1,l2,cat))
                # --- multiword check delegated to finds() ---
                elif finds(l1, l2):
                    cat = 'pl/sg: s_/'
                    suffix.append((ss,l1,l2,cat))
                # --- nothing matched: keep the pair as unexplained ---
                else:
                    cat = 'different'
                    diff.append((ss,l1,l2,cat))



## print out the results to a file
## Layout: a short header, then one section per general category (total and
## percentage of all pairs), and inside it one block per subcategory that
## lists synset name, both lemmas and the subcategory label.
pairnum = len(allpairs)
stars = '*' * 15

# 'with' closes (and flushes) the file even if one of the prints fails
with open('variants.txt', mode='w', encoding='utf8') as out:
    print ('There are %s word pairs in total.' %pairnum, file=out)
    print ('\n', file=out)
    print ('There are ten categories defined.', file=out)
    print ('They are: accent, simpli, pron, hyphen, space, ' +
           'case, prefix, suffix, punc, diff. ', file = out)
    print ('\n\n', file=out)

    for cat in allcat:
        # the result list of a category is the global named after it
        catlist = globals()[cat]
        # count the total number of word pairs in a given category
        count = len(catlist)
        # no pair at all: report 0.0 instead of dividing by zero
        percentage = count*100/pairnum if pairnum else 0.0
        print ('%s%s%s' %(stars + 'Category: '+ cat + stars,
                          'Total: ' + str(count) + stars,
                          'Percentage: ' + str(percentage)
                          + '%' + stars), file=out)
        print ('\n\n', file=out)
        # group the pairs by subcategory in a single pass; subcategories
        # come out in order of first appearance, so the file is reproducible
        scdic = {}
        for entry in catlist:
            scdic.setdefault(entry[3], []).append(entry)
        for key, allinfo in scdic.items():
            print ('%-30s%s' %('-----Subcategory: '+ key, 'Total: '
                               + str(len(allinfo)) + '-----'), file=out)
            # get the word pairs of a subcategory
            for info in allinfo:
                print ('%-30s%-30s%-30s%-5s' %(info[0].name(), info[1],
                                               info[2], info[3]), file=out)
            print ('\n\n', file=out)
        print ('\n\n\n\n', file=out)

This snippet took 0.02 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).