source/tools/correlation/test.py (fedd813) - fs-workbench.git

fedd8139d2e74f2832e87d121b414005a051094b

separated the descendants a... Raymund Zacharias authored 6 years ago	`1) # encoding: utf-8 2) 3) import sys 4) from difflib import SequenceMatcher 5) import io 6) import re 7) import argparse 8) import urllib.request 9) import urllib.parse 10) import xml.etree.ElementTree as ET 11) from descendants import get_PG_decendants, get_descendants`
implemented ancestor lookup Raymund Zacharias authored 6 years ago	`12) from ancestor import get_ancestor`
# Shared script state: the "escape" output switch read by myformat() plus a
# small hand-written sample of lemmas. Going by the compare() example, "gem"
# marks Proto-Germanic entries; "deu"/"nld" are presumably German and Dutch.
# Each "origins" entry appears to link the id of a source word ("from") to
# the id of a word derived from it ("to") -- nothing in this block reads it.
globalvars = {
    "escape": False,
    "data": {
        "words": [
            {"id": 0, "language": "gem", "lemma": "bītaną"},
            {"id": 1, "language": "gem", "lemma": "hrītaną"},
            {"id": 2, "language": "deu", "lemma": "beißen"},
            {"id": 3, "language": "deu", "lemma": "reißen"},
            {"id": 4, "language": "nld", "lemma": "bijten"},
            {"id": 5, "language": "nld", "lemma": "rijten"},
            {"id": 6, "language": "deu", "lemma": "scheißen"},
            {"id": 7, "language": "gem", "lemma": "skītaną"},
            {"id": 8, "language": "nld", "lemma": "schijten"},
        ],
        "origins": [
            {"from": 0, "to": 2},
            {"from": 1, "to": 3},
            {"from": 0, "to": 4},
            {"from": 1, "to": 5},
            {"from": 7, "to": 6},
            {"from": 7, "to": 8},
        ]
    }
}


def myformat(string):
    """Return ``string`` prepared for printing.

    When ``globalvars["escape"]`` is truthy the text is passed through the
    ``unicode_escape`` codec, so non-ASCII characters come out as backslash
    escapes; otherwise ``string`` is returned unchanged.

    The result is always a ``str``.
    """
    if not globalvars["escape"]:
        return string
    # encode() yields bytes; decode them again so that "%s" formatting shows
    # the escaped text itself instead of a b'...' literal.
    return string.encode("unicode_escape").decode("ascii")


def compare(wordpair):
    """Print every matching block shared by the two words in ``wordpair``.

    e.g. PG bītaną and hrītaną share "ītaną"

    ``wordpair`` is a pair of word dicts; only their ``"lemma"`` entries are
    read. For each block reported by ``SequenceMatcher`` the block itself is
    printed, followed by the matching slice of each lemma. Nothing is
    returned.
    """
    first, second = wordpair[0], wordpair[1]
    matcher = SequenceMatcher(None, first["lemma"], second["lemma"])
    print("\ncomparing '%s' and '%s' ..."
          % (myformat(first["lemma"]), myformat(second["lemma"])))
    for block in matcher.get_matching_blocks():
        print("\tcurrent block: %s" % (str(block)))
        size = block[2]
        if size <= 0:
            # get_matching_blocks() always ends with a zero-size sentinel;
            # there is no slice to show for it.
            continue
        # block[0] / block[1] are the start offsets in the first / second word.
        for word, begin in zip(wordpair, block):
            end = begin + size
            print("\t\t'%s'[%u:%u] = %s"
                  % (myformat(word["lemma"]), begin, end,
                     myformat(word["lemma"][begin:end])))


def main(args):
    """Run the correlation demo.

    ``args`` is the list of command line arguments (without the program
    name). The only option is ``--escape``, which switches myformat() to
    backslash-escaped output.

    Every pair of sample words that share a language is compared, then the
    descendants of "hrītaną" and of the "gem-pro" ancestor of English "white"
    are looked up and printed.
    """
    parser = argparse.ArgumentParser()
    # store_true keeps the flag a real boolean; the previous const=sum stored
    # the builtin function and only worked because functions are truthy.
    parser.add_argument("--escape", dest="escape", action="store_true",
                        default=False, help="enable escape")
    parameters = parser.parse_args(args)
    globalvars["escape"] = parameters.escape

    words = globalvars["data"]["words"]
    for position, word1 in enumerate(words):
        for word2 in words[position + 1:]:
            if word1["language"] == word2["language"]:  # if languages match
                compare((word1, word2))  # compare the pair

    result = [
        {"language": word["language"], "lemma": myformat(word["lemma"])}
        for word in get_PG_decendants("hrītaną")
    ]
    print(str(result))

    ancestors = get_ancestor("white", "en", "gem-pro")
    # NOTE(review): assumes an unsuccessful lookup yields an empty (or None)
    # result -- confirm against ancestor.get_ancestor.
    if not ancestors:
        print("no ancestor found for 'white'")
        return
    # Drop the leading "*" from the lemma before using it as a lookup key.
    ancestor_word = ancestors[0]["lemma"].lstrip("*")
    print(get_PG_decendants(ancestor_word))