Raymund Zacharias committed on 2017-08-18 05:37:37
Zeige 2 geänderte Dateien mit 52 Einfügungen und 0 Löschungen.
... | ... |
@@ -0,0 +1,48 @@ |
1 |
+import io |
|
2 |
+import re |
|
3 |
+import urllib.request |
|
4 |
+import urllib.parse |
|
5 |
+import xml.etree.ElementTree as ET |
|
6 |
+ |
|
7 |
+ |
|
8 |
# Heading of an etymology section in wikitext, e.g. "===Etymology===" or "===Etymology 1===".
_ETYMOLOGY_HEADING = re.compile(r"\=+Etymology.*?\=+")
# Body of every template, i.e. the text between "{{" and "}}" (non-greedy, single line).
_TEMPLATE_BODY = re.compile(r"(?<={{).*?(?=}})")


def _parse_ancestors(wikitext, language_code, ancestor_language_code):
    """Extract ancestor lemmas from the etymology part of a page's wikitext.

    Two template patterns are recognised:

    * ``{{etyl|<ancestor>|<language>}}`` immediately followed by
      ``{{m|<ancestor>|<lemma>}}``
    * ``{{inh|<language>|<ancestor>|<lemma>}}``

    Returns a list of ``{"language": ..., "lemma": ...}`` dicts; the list is
    empty when there is no Etymology heading or no matching template.
    """
    heading = _ETYMOLOGY_HEADING.search(wikitext)
    if heading is None:
        return []
    # Everything after the first Etymology heading, up to the very end of the
    # page (the last character is kept so a trailing "}}" is not cut off).
    etymology_string = wikitext[heading.end():]

    templates = [body.split("|") for body in _TEMPLATE_BODY.findall(etymology_string)]
    ancestor_words = []
    for index, parameters in enumerate(templates):
        template_name = parameters[0]
        # {{etyl|gem-pro|en}} {{m|gem-pro|*sagô}}
        # Slice comparison also rejects templates with too few parameters.
        if template_name == "etyl" and parameters[1:3] == [ancestor_language_code, language_code]:
            # The lemma lives in the template that follows, if there is one.
            if index + 1 < len(templates):
                following = templates[index + 1]
                if (len(following) >= 3 and following[0] == "m"
                        and following[1] == ancestor_language_code):
                    ancestor_words.append({"language": following[1], "lemma": following[2]})
        # {{inh|en|gem-pro|*hwītaz}}
        if (template_name == "inh" and len(parameters) >= 4
                and parameters[1:3] == [language_code, ancestor_language_code]):
            ancestor_words.append({"language": parameters[2], "lemma": parameters[3]})
    return ancestor_words


def get_ancestor(title, language_code, ancestor_language_code):
    """Get and parse the etymology section from en.wiktionary.org for a title.

    The page is fetched through the MediaWiki API as an XML export and the
    wikitext following the first Etymology heading is scanned for ``etyl``/``m``
    and ``inh`` templates.

    Args:
        title: Page title to look up (usually a word), e.g. ``"white"``.
        language_code: Language code of the word itself, e.g. ``"en"``.
        ancestor_language_code: Language code of the wanted ancestor,
            e.g. ``"gem-pro"``.

    Returns:
        A list of ``{"language": ancestor_language_code, "lemma": <lemma>}``
        dicts in order of appearance. The list is empty when the page has no
        text, no Etymology heading, or no matching template.

    Raises:
        urllib.error.URLError: If the HTTP request fails.
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    request_parameters = {
        "action": "query",
        "format": "xml",
        "export": 1,
        "exportnowrap": 1,
        "titles": title,
        "redirects": 1,
    }
    request_data = urllib.parse.urlencode(request_parameters).encode("utf8")
    wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php/", request_data)
    with urllib.request.urlopen(wikipage_request) as wikipage_response:
        wikipage_string = wikipage_response.read()

    root = ET.fromstring(wikipage_string)
    ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
    text_element = root.find("./mwns:page/mwns:revision/mwns:text", ns)
    # No <text> element in the export, or an empty one: nothing to parse.
    if text_element is None or not text_element.text:
        return []
    return _parse_ancestors(text_element.text, language_code, ancestor_language_code)
... | ... |
@@ -9,6 +9,7 @@ import urllib.request |
9 | 9 |
import urllib.parse |
10 | 10 |
import xml.etree.ElementTree as ET |
11 | 11 |
from descendants import get_PG_decendants, get_descendants |
12 |
+from ancestor import get_ancestor |
|
12 | 13 |
|
13 | 14 |
globalvars = { |
14 | 15 |
"escape": False, |
... | ... |
@@ -76,6 +77,9 @@ def main(args): |
76 | 77 |
compare(wordpair) #compare the pair |
77 | 78 |
result = list(map(lambda word: {"language": word["language"], "lemma": myformat(word["lemma"])}, get_PG_decendants("hrītaną"))) |
78 | 79 |
print(str(result)) |
80 |
+ ancestor_word = get_ancestor("white", "en", "gem-pro")[0]["lemma"] |
|
81 |
+ ancestor_word = ancestor_word.lstrip("*") |
|
82 |
+ print(get_PG_decendants(ancestor_word)) |
|
79 | 83 |
|
80 | 84 |
main(sys.argv[1:]) |
81 | 85 |
|
82 | 86 |