Demo entry 6657818

resolver

   

Submitted by flagmt on Nov 04, 2017 at 22:23
Language: Python 3. Code size: 2.8 kB.

# Toponym resolution driver.
#
# For every transcript found in ``full_path``: extract the tagged toponyms,
# resolve each one to a coordinate pair, write one KML file per transcript,
# and finally dump the corpus-wide toponym -> coordinates table to
# 'resolved_topos.txt'.
#
# Relies on names defined outside this section: ``full_path``,
# ``get_coords``, ``wiki`` and the ``os``, ``re`` and ``simplekml`` modules.

# corpus-level cache: normalised toponym -> resolved coordinate pair
resolved = {}
# a tagged toponym is any text between '<' and '>'; the pattern is
# loop-invariant, so compile it once
toponym_tag = re.compile(r'<(.*?)>')

for filename in os.listdir(full_path):
    # build paths with os.path.join instead of a hard-coded '\\' so the
    # script is not tied to Windows path separators
    transcript_path = os.path.join(full_path, filename)
    # os.listdir also returns sub-directories, which cannot be opened as text
    if not os.path.isfile(transcript_path):
        continue
    base_name, ext = os.path.splitext(filename)

    # read the entire transcript into a single string
    with open(transcript_path, 'r', encoding='latin-1') as full:
        s = full.read()

    # remove apostrophes and expand the "St." / "Ft." abbreviations (in that
    # order) so the names match the spelling used by the gazetteer and geonames
    topo_list = [
        topo.replace("'", "").replace("St.", "Saint").replace("Ft.", "Fort")
        for topo in toponym_tag.findall(s)
    ]

    # keying a dictionary on the toponyms of a single transcript implements
    # the 'one sense per document' heuristic, since dictionary keys are unique
    doc_topos = dict.fromkeys(topo_list)
    for topo in doc_topos:
        # already resolved for an earlier transcript: reuse those coordinates
        if topo in resolved:
            doc_topos[topo] = resolved[topo]
            continue
        # use sieve heuristic to catch well-known toponyms
        coords = get_coords(topo)
        if not coords:
            # get_coords could not resolve the toponym: fall back to wiki(),
            # which selects the best candidate from the gazetteer.
            # NOTE(review): `selected` is presumably a queue-like object whose
            # get() returns the top candidate, with the coordinates at
            # indices [1][1] and [1][2] - confirm against wiki().
            selected = wiki(topo)
            location = selected.get()
            coords = (location[1][1], location[1][2])
        doc_topos[topo] = coords
        # remember the result at corpus level for later transcripts
        resolved[topo] = coords

    # create a kml file for each transcript
    kml = simplekml.Kml()
    for key, val in doc_topos.items():
        # the stored pair is written with its two elements swapped
        # (presumably (lat, lon) -> KML's (lon, lat); confirm with get_coords)
        kml.newpoint(name=key, coords=[(val[1], val[0])])
    kml.save(os.path.join('kml', base_name + '.kml'))

# the progress message now names the file that is actually written
print('writing resolved_topos.txt...')
# write all toponyms and their resolved coordinates to a file,
# one "name,first,second" line per toponym
with open('resolved_topos.txt', 'w', encoding='latin-1') as outfile:
    for k, v in resolved.items():
        outfile.write(','.join((k, str(v[0]), str(v[1]))) + '\n')

This snippet took 0.00 seconds to highlight.

Back to the Entry List or Home.

Delete this entry (admin only).