Browse code

Line-ending type adjusted (?)

Raymund Zacharias authored on 18/08/2017 05:42:56
Showing 1 changed file
... ...
@@ -1,48 +1,48 @@
1
-import io
2
-import re
3
-import urllib.request
4
-import urllib.parse
5
-import xml.etree.ElementTree as ET
6
-
7
-
8
def get_ancestor(title, language_code, ancestor_language_code):
    """Look up ancestor words of a Wiktionary entry from its etymology text.

    The page ``title`` is exported from en.wiktionary.org through the
    MediaWiki API, the wikitext following the first "Etymology" heading is
    scanned for ``{{...}}`` templates, and two template shapes are collected:

    * ``{{etyl|<ancestor>|<language>}}`` immediately followed by
      ``{{m|<ancestor>|<lemma>}}``
    * ``{{inh|<language>|<ancestor>|<lemma>}}``

    Args:
        title: Page title to look up (usually a word).
        language_code: Language code of the word itself, e.g. ``"en"``.
        ancestor_language_code: Language code of the wanted ancestor,
            e.g. ``"gem-pro"``.

    Returns:
        A list of ``{"language": ..., "lemma": ...}`` dicts in the order the
        templates appear in the text.

    Raises:
        AttributeError: If the export has no page text element or the text
            has no Etymology heading.
        IndexError: If a matching ``etyl`` template is the last template in
            the text, or a matched template has too few parameters.
    """
    query = urllib.parse.urlencode({
        "action": "query",
        "format": "xml",
        "export": 1,
        "exportnowrap": 1,
        "titles": title,
        "redirects": 1,
    }).encode('utf8')
    api_request = urllib.request.Request("https://en.wiktionary.org/w/api.php/", query)
    with urllib.request.urlopen(api_request) as response:
        raw_export = response.read()

    namespaces = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
    export_root = ET.fromstring(raw_export)
    page_text = export_root.find("./mwns:page/mwns:revision/mwns:text", namespaces).text

    heading = re.search(r"\=+Etymology.*?\=+", page_text)
    # Everything after the heading, except the very last character of the page.
    etymology = page_text[heading.end():-1]

    # Non-greedy capture of whatever sits between each "{{" and the next "}}".
    templates = re.findall(r"(?<={{).*?(?=}})", etymology)

    found = []
    for position, template in enumerate(templates):
        fields = template.split("|")

        # Pattern: {{etyl|gem-pro|en}} {{m|gem-pro|*sagô}}
        is_etyl = (
            fields[0] == "etyl"
            and fields[1] == ancestor_language_code
            and fields[2] == language_code
        )
        # NOTE: the position test holds for every index enumerate() yields,
        # so it does not guard the look-ahead below.
        if is_etyl and position < len(templates):
            follower = templates[position + 1].split("|")
            if follower[0] == "m" and follower[1] == ancestor_language_code:
                found.append({"language": follower[1], "lemma": follower[2]})

        # Pattern: {{inh|en|gem-pro|*hwītaz}}
        is_inherited = (
            fields[0] == "inh"
            and fields[1] == language_code
            and fields[2] == ancestor_language_code
        )
        if is_inherited:
            found.append({"language": fields[2], "lemma": fields[3]})

    return found
1
+import io
2
+import re
3
+import urllib.request
4
+import urllib.parse
5
+import xml.etree.ElementTree as ET
6
+
7
+
8
def get_ancestor(title, language_code, ancestor_language_code, *, wikitext=None):
    """Return ancestor words named in the Etymology section of a Wiktionary page.

    The wikitext after the first "Etymology" heading is scanned for
    ``{{...}}`` templates and two shapes are collected:

    * ``{{etyl|<ancestor>|<language>}}`` immediately followed by
      ``{{m|<ancestor>|<lemma>}}``
    * ``{{inh|<language>|<ancestor>|<lemma>}}``

    Args:
        title: Page title on en.wiktionary.org (usually a word).
        language_code: Language code of the word itself, e.g. ``"en"``.
        ancestor_language_code: Language code of the wanted ancestor,
            e.g. ``"gem-pro"``.
        wikitext: Optional page wikitext. When given, it is parsed directly
            and no network request is made; when ``None`` (the default) the
            page is downloaded from en.wiktionary.org.

    Returns:
        A list of ``{"language": ..., "lemma": ...}`` dicts in the order the
        templates appear. The list is empty when the page is missing, has no
        text, has no Etymology heading, or names no matching ancestor.

    Raises:
        urllib.error.URLError: If the download fails.
        xml.etree.ElementTree.ParseError: If the API response is not valid XML.
    """
    if wikitext is None:
        wikitext = _fetch_wikitext(title)
    if not wikitext:
        return []
    return _parse_ancestors(wikitext, language_code, ancestor_language_code)


def _fetch_wikitext(title):
    """Download the wikitext of ``title``; return None if the export has no page text."""
    request_parameters = {
        "action": "query",
        "format": "xml",
        "export": 1,
        "exportnowrap": 1,
        "titles": title,
        "redirects": 1,
    }
    request_data = urllib.parse.urlencode(request_parameters).encode("utf8")
    wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php/", request_data)
    with urllib.request.urlopen(wikipage_request) as wikipage_response:
        wikipage_string = wikipage_response.read()

    root = ET.fromstring(wikipage_string)
    ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
    text_element = root.find("./mwns:page/mwns:revision/mwns:text", ns)
    # A missing page yields an export without a <page>/<revision>/<text> element.
    if text_element is None:
        return None
    return text_element.text


def _parse_ancestors(wikitext, language_code, ancestor_language_code):
    """Collect ancestor lemmas from the templates after the first Etymology heading."""
    heading = re.search(r"\=+Etymology.*?\=+", wikitext)
    if heading is None:
        return []
    # Keep the text up to its very end: trimming the last character would cut
    # the closing "}}" of a template that ends the page.
    etymology_string = wikitext[heading.end():]

    # Non-greedy match of everything between "{{" and the next "}}", split
    # into the template name and its positional parameters.
    templates = [
        body.split("|")
        for body in re.findall(r"(?<={{).*?(?=}})", etymology_string)
    ]

    ancestor_words = []
    for index, parameters in enumerate(templates):
        # {{etyl|gem-pro|en}} {{m|gem-pro|*sagô}}
        # Slice comparison is False (not an IndexError) for short templates.
        if parameters[:3] == ["etyl", ancestor_language_code, language_code]:
            # The lemma lives in the *next* template, which may not exist.
            if index + 1 < len(templates):
                following = templates[index + 1]
                if len(following) >= 3 and following[:2] == ["m", ancestor_language_code]:
                    ancestor_words.append({"language": following[1], "lemma": following[2]})
        # {{inh|en|gem-pro|*hwītaz}}
        if len(parameters) >= 4 and parameters[:3] == ["inh", language_code, ancestor_language_code]:
            ancestor_words.append({"language": parameters[2], "lemma": parameters[3]})
    return ancestor_words