import io
import re
import urllib.request
import urllib.parse
import xml.etree.ElementTree as ET


def get_ancestor(title, language_code, ancestor_language_code):
    """Fetch a page from en.wiktionary.org and parse its Etymology section
    for ancestor terms of the given title (usually a word)."""
    request_parameters = {
        "action": "query",
        "format": "xml",
        "export": 1,
        "exportnowrap": 1,
        "titles": title,
        "redirects": 1
    }
    request_data = urllib.parse.urlencode(request_parameters)
    request_data = request_data.encode("utf8")
    wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php", request_data)
    with urllib.request.urlopen(wikipage_request) as wikipage_response:
        # For offline testing, a saved export can be read instead:
        # with open("Wiktionary-20170813141826.xml", "r", encoding="utf-8") as wikipage_response:
        wikipage_string = wikipage_response.read()
    root = ET.fromstring(wikipage_string)
    ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
    # wikipage_text_string = root.find("./mwns:page/mwns:title", ns).text
    wikipage_text_string = root.find("./mwns:page/mwns:revision/mwns:text", ns).text
    # ET.dump(root)
    # print(wikipage_text_string)
    match = re.search(r"=+Etymology.*?=+", wikipage_text_string)
    if match is None:
        return []  # the page has no Etymology section
    etymology_string = wikipage_text_string[match.end():-1]
    ancestor_words = []
    # Wikitext templates look like {{l|ang|hungor}}: "{{" opens, "}}" closes, "|" separates parameters.
    # Find every string between {{ and }} in a non-greedy manner.
    matches = re.findall(r"(?<={{).*?(?=}})", etymology_string)
    for idx, match in enumerate(matches):
        parameters = match.split("|")
        # Old style: {{etyl|gem-pro|en}} followed by {{m|gem-pro|*sagô}}
        if (parameters[0] == "etyl") and (parameters[1] == ancestor_language_code) and (parameters[2] == language_code):
            if idx + 1 < len(matches):  # the mention template must actually follow
                next_match = matches[idx + 1]
                nm_parameters = next_match.split("|")
                if (nm_parameters[0] == "m") and (nm_parameters[1] == ancestor_language_code):
                    ancestor_words.append({"language": nm_parameters[1], "lemma": nm_parameters[2]})
        # New style: {{inh|en|gem-pro|*hwītaz}}
        if (parameters[0] == "inh") and (parameters[1] == language_code) and (parameters[2] == ancestor_language_code):
            ancestor_words.append({"language": parameters[2], "lemma": parameters[3]})
    return ancestor_words
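A minimal usage sketch follows; the word and the expected lemma are illustrative only, taken from the {{inh|en|gem-pro|*hwītaz}} example in the comments, and the live Wiktionary page may return something different.

if __name__ == "__main__":
    # Illustrative query: the Proto-Germanic ("gem-pro") ancestor of English "white".
    # The exact result depends on the current wikitext of the Wiktionary page.
    ancestors = get_ancestor("white", "en", "gem-pro")
    print(ancestors)  # e.g. [{'language': 'gem-pro', 'lemma': '*hwītaz'}]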