# encoding: utf-8
"""Print the matching blocks shared by same-language lemmas.

For every pair of words in ``globalvars["data"]["words"]`` that have the same
language, the script prints the blocks their lemmas have in common (as found
by ``difflib.SequenceMatcher``).  Afterwards it prints whatever
``get_PG_decendants`` returns for the lemma "hrītaną".
"""
import sys
from difflib import SequenceMatcher
from itertools import combinations
import io
import re
import argparse
import urllib.request
import urllib.parse
import xml.etree.ElementTree as ET

from descendants import get_PG_decendants, get_descendants

# Shared script state: the "escape" output switch plus the built-in sample
# data.  Each entry of "origins" refers to entries of "words" by their "id".
globalvars = {
    "escape": False,
    "data": {
        "words": [
            {"id": 0, "language": "gem", "lemma": "bītaną"},
            {"id": 1, "language": "gem", "lemma": "hrītaną"},
            {"id": 2, "language": "deu", "lemma": "beißen"},
            {"id": 3, "language": "deu", "lemma": "reißen"},
            {"id": 4, "language": "nld", "lemma": "bijten"},
            {"id": 5, "language": "nld", "lemma": "rijten"},
            {"id": 6, "language": "deu", "lemma": "scheißen"},
            {"id": 7, "language": "gem", "lemma": "skītaną"},
            {"id": 8, "language": "nld", "lemma": "schijten"},
        ],
        "origins": [
            {"from": 0, "to": 2},
            {"from": 1, "to": 3},
            {"from": 0, "to": 4},
            {"from": 1, "to": 5},
            {"from": 7, "to": 6},
            {"from": 7, "to": 8},
        ],
    },
}


def myformat(string):
    """Return *string* in the form it should be printed.

    When ``globalvars["escape"]`` is truthy the text is passed through the
    ``unicode_escape`` codec, so non-ASCII characters come out as backslash
    escapes.  Otherwise the string is returned unchanged.
    """
    if globalvars["escape"]:
        # encode() produces bytes; decode again so "%s" formatting shows the
        # escaped text itself instead of a b'...' representation.
        return string.encode("unicode_escape").decode("ascii")
    return string


def compare(wordpair):
    """Print every matching block shared by the lemmas of two words.

    e.g. PG bītaną and hrītaną share "ītaną"

    Args:
        wordpair: A pair of word dicts, each with a "lemma" key.

    Nothing is returned; the findings are written to standard output.
    """
    first, second = wordpair[0], wordpair[1]
    matcher = SequenceMatcher(None, first["lemma"], second["lemma"])
    print("\ncomparing '%s' and '%s' ..."
          % (myformat(first["lemma"]), myformat(second["lemma"])))
    for block in matcher.get_matching_blocks():
        print("\tcurrent block: %s" % (str(block)))
        if block.size <= 0:
            # The list always ends with a zero-length sentinel block; it has
            # no text to show.
            continue
        # block.a / block.b are the start offsets in the first / second lemma.
        for word, begin in zip(wordpair, (block.a, block.b)):
            end = begin + block.size
            print("\t\t'%s'[%u:%u] = %s"
                  % (myformat(word["lemma"]), begin, end,
                     myformat(word["lemma"][begin:end])))


def main(args):
    """Parse *args*, compare same-language word pairs and print the results.

    Args:
        args: Command-line arguments without the program name.  The only
            recognised option is ``--escape``, which switches ``myformat``
            to escaped output.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--escape", dest="escape", action="store_true",
                        help="enable escape")
    parameters = parser.parse_args(args)
    globalvars["escape"] = parameters.escape

    # combinations() yields each unordered pair once, in list order.
    for word1, word2 in combinations(globalvars["data"]["words"], 2):
        if word1["language"] == word2["language"]:
            compare((word1, word2))

    result = [
        {"language": word["language"], "lemma": myformat(word["lemma"])}
        for word in get_PG_decendants("hrītaną")
    ]
    print(str(result))


if __name__ == "__main__":
    main(sys.argv[1:])