implemented ancestor lookup (fedd813) - fs-workbench.git

implemented ancestor lookup

Raymund Zacharias committed on 2017-08-18 05:37:37
Zeige 2 geänderte Dateien mit 52 Einfügungen und 0 Löschungen.

source/tools/correlation/ancestor.py 0000000..56c4f43
source/tools/correlation/test.py c1baae4..5d18ac5

...	...	@@ -0,0 +1,48 @@
	1	+import io
	2	+import re
	3	+import urllib.request
	4	+import urllib.parse
	5	+import xml.etree.ElementTree as ET
	6	+
	7	+
	8	+def get_ancestor(title, language_code, ancestor_language_code):
	9	+ """Get and parse the descendant section from en.wiktionary.org for a specific title (usually word)"""
	10	+ request_parameters = {
	11	+ "action": "query",
	12	+ "format": "xml",
	13	+ "export": 1,
	14	+ "exportnowrap": 1,
	15	+ "titles": title,
	16	+ "redirects": 1
	17	+ }
	18	+ request_data = urllib.parse.urlencode(request_parameters)
	19	+ request_data = request_data.encode('utf8')
	20	+ wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php/", request_data)
	21	+ with urllib.request.urlopen(wikipage_request) as wikipage_response:
	22	+ # with open("Wiktionary-20170813141826.xml", "r", encoding="utf-8") as wikipage_response:
	23	+ wikipage_string = wikipage_response.read()
	24	+ root = ET.fromstring(wikipage_string)
	25	+ ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
	26	+ # wikipage_text_string = root.find("./mwns:page/mwns:title", ns).text
	27	+ wikipage_text_string = root.find("./mwns:page/mwns:revision/mwns:text", ns).text
	28	+ # ET.dump(root)
	29	+ # print(wikipage_text_string)
	30	+ match = re.search(r"\=+Etymology.*?\=+", wikipage_text_string)
	31	+ etymology_string = wikipage_text_string[match.end():-1]
	32	+ ancestor_words = []
	33	+ # {{l\|ang\|hungor}} start_tag = "{{" end_tag = "}}" parameter_tag = "\|"
	34	+ matches = re.findall(r"(?<={{).*?(?=}})",
	35	+ etymology_string) # find all matches for every string between {{ and }} in a non-greedy manner
	36	+ for id, match in enumerate(matches):
	37	+ parameters = match.split("\|")
	38	+ # {{etyl\|gem-pro\|en}} {{m\|gem-pro\|*sagô}}
	39	+ if (parameters[0] == "etyl") and (parameters[1] == ancestor_language_code) and (parameters[2] == language_code):
	40	+ if id < len(matches):
	41	+ next_match = matches[id + 1]
	42	+ nm_parameters = next_match.split("\|")
	43	+ if (nm_parameters[0] == "m") and (nm_parameters[1] == ancestor_language_code):
	44	+ ancestor_words.append({"language": nm_parameters[1], "lemma": nm_parameters[2]})
	45	+ # {{inh\|en\|gem-pro\|*hwītaz}}
	46	+ if (parameters[0] == "inh") and (parameters[1] == language_code) and (parameters[2] == ancestor_language_code):
	47	+ ancestor_words.append({"language": parameters[2], "lemma": parameters[3]})
	48	+ return ancestor_words

source/tools/correlation/test.py

Zeige Datei @ fedd813

@@ -9,6 +9,7 @@ import urllib.request
 import urllib.parse
 import xml.etree.ElementTree as ET
 from descendants import get_PG_decendants, get_descendants
+from ancestor import get_ancestor
 
 globalvars = {
     "escape": False,
@@ -76,6 +77,9 @@ def main(args):
                 compare(wordpair) #compare the pair
     result = list(map(lambda word: {"language": word["language"], "lemma": myformat(word["lemma"])}, get_PG_decendants("hrītaną")))
     print(str(result))
+    ancestor_word = get_ancestor("white", "en", "gem-pro")[0]["lemma"]
+    ancestor_word = ancestor_word.lstrip("*")
+    print(get_PG_decendants(ancestor_word))
                 
 main(sys.argv[1:])
 


...	...	@@ -9,6 +9,7 @@ import urllib.request
9	9	import urllib.parse
10	10	import xml.etree.ElementTree as ET
11	11	from descendants import get_PG_decendants, get_descendants
	12	+from ancestor import get_ancestor
12	13
13	14	globalvars = {
14	15	"escape": False,
...	...	@@ -76,6 +77,9 @@ def main(args):
76	77	compare(wordpair) #compare the pair
77	78	result = list(map(lambda word: {"language": word["language"], "lemma": myformat(word["lemma"])}, get_PG_decendants("hrītaną")))
78	79	print(str(result))
	80	+ ancestor_word = get_ancestor("white", "en", "gem-pro")[0]["lemma"]
	81	+ ancestor_word = ancestor_word.lstrip("*")
	82	+ print(get_PG_decendants(ancestor_word))
79	83
80	84	main(sys.argv[1:])
81	85
82	86