Demo entry 6648075

main

   

Submitted by anonymous on Oct 22, 2017 at 18:51
Language: Python 3. Code size: 7.5 kB.

"""
INVESTIGATING ORTHOGRAPHIC VARIATION IN JAPANESE

Author: Anthony Tan
Matric. No.: U1540821E
Course: HG2051 Language and the Computer
Date created: 24 September 2017
Date last modified: 23 October 2017
Version: Python 3.5.4

Broadly, this program does two things:
    (1) Retrieve all the available Japanese synsets and try to categorise the synset's lemmas according to their orthographic variation.
    (2) Output all the results from (1) to various text files, according to their orthographic categorisation.
"""

import nltk
import re
from nltk.corpus import wordnet as wn
from nltk.metrics import edit_distance
from orthographic_ninjitsu import *
import time
    
def take_first(elem):
    """Return the first element of *elem*, for use as a sort key.

    (The original docstring said "second element", but the function has
    always returned ``elem[0]``.)
    """
    return elem[0]

def main():
    """Categorise and record orthographic variation in Japanese WordNet lemmas.

    Walks every synset in WordNet, compares each pair of Japanese lemmas
    within a synset, and classifies near-identical pairs (edit distance
    <= 4) into orthographic-variation categories using the helpers from
    ``orthographic_ninjitsu``.  Writes one text file per category, a
    ``summary.txt`` overview, and ``kanji_pairs.txt`` listing the unique
    kanji-kanji substitution pairs.

    Side effects: prints progress to stdout and (over)writes the output
    files in the current working directory.  Returns ``None``.
    """
    ### one list per variation category
    all_vars = []          # every variant pair, regardless of category
    kanji_vars = []        # kanji-kanji variants
    sokuon_vars = []       # small-tsu (sokuon) variants
    nakaguro_vars = []     # interpunct (nakaguro) variants
    long_vowel_vars = []   # long-vowel kana variants
    odoriji_vars = []      # iteration-mark (odoriji) variants
    honorable_vars = []    # o- honorific prefix variants
    gonorable_vars = []    # go- honorific prefix variants
    okurigana_vars = []    # okurigana variants
    tzuzu_vars = []        # づ vs. ず variants
    ou_vars = []           # お vs. う variants

    # name -> list; each name doubles as the output file stem
    my_vars = {
        "all_vars": all_vars,
        "kanji_vars": kanji_vars,
        "sokuon_vars": sokuon_vars,
        "nakaguro_vars": nakaguro_vars,
        "long_vowel_vars": long_vowel_vars,
        "odoriji_vars": odoriji_vars,
        "honorable_vars": honorable_vars,
        "gonorable_vars": gonorable_vars,
        "okurigana_vars": okurigana_vars,
        "tzuzu_vars": tzuzu_vars,
        "ou_vars": ou_vars,
        }

    kanji_pairs = []  # kanji substitution pairs; deduplicated at the end

    ### counters for empty/non-empty Japanese synsets
    jap_lemmas = 0
    jap_synsets = 0
    empty_jap_synsets = 0
    my_empty_jap_synsets = []

    ### setup parameters
    lng = "jpn"   # WordNet language code
    maxdist = 4   # max edit distance for two lemmas to count as variants

    ### get all the synsets
    print("Getting all the synsets...")
    all_synsets = list(wn.all_synsets())

    print("[+] {} English synsets.".format(len(all_synsets)))  # size of synsets

    ### check the synsets
    print("Checking the synsets...")
    for synset in all_synsets:
        lemmas = synset.lemma_names(lang=lng)

        ### check/count for presence of Japanese synsets and lemmas
        if not lemmas:  # seen an empty synset
            empty_jap_synsets += 1
            my_empty_jap_synsets.append(synset.name())
        else:  # seen synset with Japanese lemmas
            jap_synsets += 1
            jap_lemmas += len(lemmas)

        ### classify orthographic variation between every lemma pair
        for l1 in lemmas:
            for l2 in lemmas:
                # l1 > l2 keeps exactly one ordering of each pair (and
                # implies l1 != l2); it is tested before the much more
                # expensive edit-distance computation.
                if l1 > l2 and edit_distance(l1, l2) <= maxdist:
                    cat = "different"

                    # small kana tsu (sokuon)
                    if ninja_char(l1, l2, "ッ"):
                        cat = "-ッ- mark"
                        sokuon_vars.append((synset, l1, l2, cat))

                    # interpunct/nakaguro
                    elif ninja_char(l1, l2, "・"):
                        cat = "・ mark"
                        nakaguro_vars.append((synset, l1, l2, cat))

                    # long vowels
                    elif ninja_char(l1, l2, "ー"):
                        cat = "long vowel"
                        long_vowel_vars.append((synset, l1, l2, cat))

                    # duplication with odoriji (iteration mark)
                    elif odorijutsu(l1, l2):
                        cat = "-々 mark"
                        odoriji_vars.append((synset, l1, l2, cat))

                    # exalted o- prefix
                    elif ojutsu(l1, l2):
                        cat = "お- pfx"
                        honorable_vars.append((synset, l1, l2, cat))

                    # exalted go- prefix (original used tab indentation
                    # here, which mixes tabs and spaces — a TabError risk)
                    elif gojutsu(l1, l2):
                        cat = "ご- pfx"
                        gonorable_vars.append((synset, l1, l2, cat))

                    # okurigana variants
                    elif okurijutsu(l1, l2):
                        cat = "okurigana"
                        okurigana_vars.append((synset, l1, l2, cat))

                    # different kanji
                    elif kanjijutsu(l1, l2):
                        kanji_var = get_kanjijutsu(l1, l2)
                        cat = "{}/{}:kanji".format(kanji_var[0], kanji_var[1])
                        kanji_vars.append((synset, l1, l2, cat))
                        kanji_pairs.append(kanji_var)

                    # tzu vs. zu
                    elif tzuzujutsu(l1, l2):
                        cat = "づ/ず:kana"
                        tzuzu_vars.append((synset, l1, l2, cat))

                    # o vs. u
                    elif oujutsu(l1, l2):
                        cat = "お/う:kana"
                        ou_vars.append((synset, l1, l2, cat))

                    # store the result (any kind of variant)
                    all_vars.append((synset, l1, l2, cat))

    ### output (on Japanese synsets/lemmas)
    # "Japenese" typo in the original message fixed here
    print("[+] {} Synsets with Japanese lemmas.".format(jap_synsets))
    print("[+] {} Japanese lemmas found.".format(jap_lemmas))
    print("[-] {} Synsets with no Japanese lemmas.".format(empty_jap_synsets))

    ### output (main results)
    print()
    print("Writing results to various files...")

    all_vars_found = len(all_vars)  # total number of variants found

    # summary.txt is opened once; mode "w" truncates any previous run.
    # The original re-opened it in append mode on every loop iteration and
    # closed only the final pair of handles (the close() calls sat outside
    # the loop), leaking one file handle per category.
    with open("summary.txt", mode="w", encoding="utf8") as summary:
        print("Results:", file=summary)

        for fname, var in sorted(my_vars.items()):
            vars_found = len(var)  # number of specific variants found
            # percentage of this variant relative to all variants found
            percent_var = vars_found / all_vars_found * 100

            # print to interpreter
            print("\t{} results ({}%) for {}".format(
                vars_found, percent_var, fname))

            # print to summary file
            print("{}\t{}, ({}%)".format(
                vars_found, percent_var, fname), file=summary)

            # one file per category; "with" guarantees the handle closes
            with open(fname + ".txt", mode="w", encoding="utf8") as out:
                print("{} results".format(vars_found), file=out)
                for (syn, w1, w2, cat) in var:
                    print("\t".join([syn.name(), w1, w2, cat]), file=out)

    ### output (unique kanji-kanji variants), sorted by first kanji
    unique_pairs = sorted(set(kanji_pairs), key=take_first)

    with open("kanji_pairs.txt", mode="w", encoding="utf8") as out:
        print("\t{} results for kanji_pairs".format(len(unique_pairs)))
        print(len(unique_pairs), "pairs", file=out)
        for kanji_pair in unique_pairs:
            print(kanji_pair, file=out)

### "The World!" — JoJo reference in the original
if __name__ == "__main__":  # guard so importing this module has no side effects
    main()  # "Time, stop!"
    print()  # "And now, time resumes."
    print("WRYYYYYYYYYYYYYYYYYYY!!")

This snippet took 0.01 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).