source/tools/correlation/ancestor.py (b3c4063) - fs-workbench.git

b3c4063a7ccae2b875f8b408d5574479f29bc0ad

Raymund Zacharias authored 6 years ago

1) import io
2) import re
3) import urllib.request
4) import urllib.parse
5) import xml.etree.ElementTree as ET
6) 
7) 
_WIKTIONARY_API_URL = "https://en.wiktionary.org/w/api.php/"
# Heading such as "===Etymology===" or "===Etymology 1===".
_ETYMOLOGY_HEADING_RE = re.compile(r"\=+Etymology.*?\=+")
# Everything between "{{" and the next "}}" on the same line (non-greedy).
_TEMPLATE_BODY_RE = re.compile(r"(?<={{).*?(?=}})")


def _fetch_wikitext(title):
    """Download the export of *title* from en.wiktionary.org and return its wikitext.

    Returns None when the response contains no page/revision/text element or
    when that element is empty.
    """
    request_parameters = {
        "action": "query",
        "format": "xml",
        "export": 1,
        "exportnowrap": 1,
        "titles": title,
        "redirects": 1,
    }
    request_data = urllib.parse.urlencode(request_parameters).encode("utf8")
    wikipage_request = urllib.request.Request(_WIKTIONARY_API_URL, request_data)
    with urllib.request.urlopen(wikipage_request) as wikipage_response:
        root = ET.fromstring(wikipage_response.read())
    # The namespace is read from the document root instead of being
    # hard-coded, so a response using another export schema version still
    # resolves.
    namespace = root.tag.partition("}")[0] + "}" if root.tag.startswith("{") else ""
    text_element = root.find("./{0}page/{0}revision/{0}text".format(namespace))
    return None if text_element is None else text_element.text


def get_ancestor(title, language_code, ancestor_language_code, *, wikitext=None):
    """Collect the ancestor words named in the etymology of a Wiktionary entry.

    The text following the first "Etymology" heading of the page is scanned
    for templates, and these three forms are recognised:

    * ``{{etyl|<ancestor>|<language>}}`` directly followed by
      ``{{m|<ancestor>|<lemma>}}``
    * ``{{inh|<language>|<ancestor>|<lemma>}}``
    * ``{{der|<language>|<ancestor>|<lemma>|...}}``

    Args:
        title: Page title (usually a word) to look up on en.wiktionary.org.
        language_code: Language code of the word itself, e.g. ``"en"``.
        ancestor_language_code: Language code of the wanted ancestor,
            e.g. ``"gem-pro"``.
        wikitext: Optional wikitext of the page. When given, it is parsed
            directly and no network request is made.

    Returns:
        A list of ``{"language": ..., "lemma": ...}`` dicts in the order the
        templates appear; empty when the page has no text, no Etymology
        heading, or no matching template.

    Raises:
        urllib.error.URLError: If the page has to be downloaded and the
            request fails.
    """
    if wikitext is None:
        wikitext = _fetch_wikitext(title)
    if not wikitext:
        return []

    heading = _ETYMOLOGY_HEADING_RE.search(wikitext)
    if heading is None:
        return []
    # Keep everything up to the end of the page: cutting the last character
    # would break a template that closes the text.
    etymology_string = wikitext[heading.end():]

    templates = [
        body.split("|") for body in _TEMPLATE_BODY_RE.findall(etymology_string)
    ]
    ancestor_words = []
    for index, parameters in enumerate(templates):
        name = parameters[0]
        # {{etyl|gem-pro|en}} {{m|gem-pro|*sagô}}
        if name == "etyl":
            is_wanted_pair = (
                len(parameters) >= 3
                and parameters[1] == ancestor_language_code
                and parameters[2] == language_code
            )
            # The lemma lives in the template that follows, which may not exist.
            if is_wanted_pair and index + 1 < len(templates):
                following = templates[index + 1]
                if (
                    len(following) >= 3
                    and following[0] == "m"
                    and following[1] == ancestor_language_code
                ):
                    ancestor_words.append(
                        {"language": following[1], "lemma": following[2]}
                    )
        # {{inh|en|gem-pro|*hwītaz}}
        # {{der|it|la|alacer||lively; happy, cheerful}}
        elif name in ("inh", "der"):
            if (
                len(parameters) >= 4
                and parameters[1] == language_code
                and parameters[2] == ancestor_language_code
            ):
                ancestor_words.append(
                    {"language": parameters[2], "lemma": parameters[3]}
                )
    return ancestor_words