import io
import re
import urllib.request
import urllib.parse
import xml.etree.ElementTree as ET


def get_ancestor(title, language_code, ancestor_language_code):
    """Fetch the etymology section from en.wiktionary.org for a given title (usually a word)
    and return the ancestor words it names in the requested ancestor language."""
    request_parameters = {
        "action": "query",
        "format": "xml",
        "export": 1,
        "exportnowrap": 1,
        "titles": title,
        "redirects": 1
    }
    request_data = urllib.parse.urlencode(request_parameters)
    request_data = request_data.encode('utf8')
    wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php/", request_data)
    with urllib.request.urlopen(wikipage_request) as wikipage_response:
        # with open("Wiktionary-20170813141826.xml", "r", encoding="utf-8") as wikipage_response:
        wikipage_string = wikipage_response.read()
        root = ET.fromstring(wikipage_string)
        ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
        # wikipage_text_string = root.find("./mwns:page/mwns:title", ns).text
        wikipage_text_string = root.find("./mwns:page/mwns:revision/mwns:text", ns).text
        # ET.dump(root)
        # print(wikipage_text_string)
        match = re.search(r"=+Etymology.*?=+", wikipage_text_string)
        if match is None:
            # The entry has no etymology section, so there is nothing to extract.
            return []
        etymology_string = wikipage_text_string[match.end():-1]
        ancestor_words = []
        # {{l|ang|hungor}} start_tag = "{{" end_tag = "}}" parameter_tag = "|"
        # Find every string between {{ and }} in a non-greedy manner.
        matches = re.findall(r"(?<={{).*?(?=}})", etymology_string)
        for id, match in enumerate(matches):
            parameters = match.split("|")
            # {{etyl|gem-pro|en}} {{m|gem-pro|*sagô}}
            if (parameters[0] == "etyl") and (parameters[1] == ancestor_language_code) and (parameters[2] == language_code):
                # The {{etyl}} template only names the languages; the lemma comes from the following {{m}} template.
                if id + 1 < len(matches):
                    next_match = matches[id + 1]
                    nm_parameters = next_match.split("|")
                    if (nm_parameters[0] == "m") and (nm_parameters[1] == ancestor_language_code):
                        ancestor_words.append({"language": nm_parameters[1], "lemma": nm_parameters[2]})
            # {{inh|en|gem-pro|*hwītaz}}
            if (parameters[0] == "inh") and (parameters[1] == language_code) and (parameters[2] == ancestor_language_code):
                ancestor_words.append({"language": parameters[2], "lemma": parameters[3]})
        return ancestor_words
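

# Minimal usage sketch (not part of the original change): the entry "hunger" and the
# Wiktionary language codes "en" / "gem-pro" below are illustrative assumptions.
if __name__ == "__main__":
    for ancestor in get_ancestor("hunger", "en", "gem-pro"):
        print(ancestor["language"], ancestor["lemma"])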