Browse code

refactored lookup code to avoid duplication

Raymund Zacharias authored on 19/08/2017 02:39:18
Showing 4 changed files
... ...
@@ -1,51 +1,28 @@
1
-import io
2 1
 import re
3
-import urllib.request
4
-import urllib.parse
5
-import xml.etree.ElementTree as ET
6
-
2
+from mediawiki import get_wikipage_text
7 3
 
8 4
 def get_ancestor(title, language_code, ancestor_language_code):
9 5
     """Get and parse the etymology section from en.wiktionary.org for a specific title (usually word)"""
10
-    request_parameters = {
11
-        "action": "query",
12
-        "format": "xml",
13
-        "export": 1,
14
-        "exportnowrap": 1,
15
-        "titles": title,
16
-        "redirects": 1
17
-    }
18
-    request_data = urllib.parse.urlencode(request_parameters)
19
-    request_data = request_data.encode('utf8')
20
-    wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php/", request_data)
21
-    with urllib.request.urlopen(wikipage_request) as wikipage_response:
22
-        # with open("Wiktionary-20170813141826.xml", "r", encoding="utf-8") as wikipage_response:
23
-        wikipage_string = wikipage_response.read()
24
-        root = ET.fromstring(wikipage_string)
25
-        ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
26
-        # wikipage_text_string = root.find("./mwns:page/mwns:title", ns).text
27
-        wikipage_text_string = root.find("./mwns:page/mwns:revision/mwns:text", ns).text
28
-        # ET.dump(root)
29
-        # print(wikipage_text_string)
30
-        match = re.search(r"\=+Etymology.*?\=+", wikipage_text_string)
31
-        etymology_string = wikipage_text_string[match.end():-1]
32
-        ancestor_words = []
33
-        # {{l|ang|hungor}} start_tag = "{{" end_tag = "}}" parameter_tag = "|"
34
-        matches = re.findall(r"(?<={{).*?(?=}})",
35
-                             etymology_string)  # find all matches for every string between {{ and }} in a non-greedy manner
36
-        for id, match in enumerate(matches):
37
-            parameters = match.split("|")
38
-            # {{etyl|gem-pro|en}} {{m|gem-pro|*sagô}}
39
-            if (parameters[0] == "etyl") and (parameters[1] == ancestor_language_code) and (parameters[2] == language_code):
40
-                if id < len(matches):
41
-                    next_match = matches[id + 1]
42
-                    nm_parameters = next_match.split("|")
43
-                    if (nm_parameters[0] == "m") and (nm_parameters[1] == ancestor_language_code):
44
-                        ancestor_words.append({"language": nm_parameters[1], "lemma": nm_parameters[2]})
45
-            # {{inh|en|gem-pro|*hwītaz}}
46
-            if (parameters[0] == "inh") and (parameters[1] == language_code) and (parameters[2] == ancestor_language_code):
47
-                ancestor_words.append({"language": parameters[2], "lemma": parameters[3]})
48
-            # {{der|it|la|alacer||lively; happy, cheerful}}
49
-            if (parameters[0] == "der") and (parameters[1] == language_code) and (parameters[2] == ancestor_language_code):
50
-                ancestor_words.append({"language": parameters[2], "lemma": parameters[3]})
51
-        return ancestor_words
6
+    wikipage_text_string = get_wikipage_text(title)
7
+    match = re.search(r"\=+Etymology.*?\=+", wikipage_text_string)
8
+    etymology_string = wikipage_text_string[match.end():-1]
9
+    ancestor_words = []
10
+    # {{l|ang|hungor}} start_tag = "{{" end_tag = "}}" parameter_tag = "|"
11
+    matches = re.findall(r"(?<={{).*?(?=}})",
12
+                         etymology_string)  # find all matches for every string between {{ and }} in a non-greedy manner
13
+    for id, match in enumerate(matches):
14
+        parameters = match.split("|")
15
+        # {{etyl|gem-pro|en}} {{m|gem-pro|*sagô}}
16
+        if (parameters[0] == "etyl") and (parameters[1] == ancestor_language_code) and (parameters[2] == language_code):
17
+            if id < len(matches):
18
+                next_match = matches[id + 1]
19
+                nm_parameters = next_match.split("|")
20
+                if (nm_parameters[0] == "m") and (nm_parameters[1] == ancestor_language_code):
21
+                    ancestor_words.append({"language": nm_parameters[1], "lemma": nm_parameters[2]})
22
+        # {{inh|en|gem-pro|*hwītaz}}
23
+        if (parameters[0] == "inh") and (parameters[1] == language_code) and (parameters[2] == ancestor_language_code):
24
+            ancestor_words.append({"language": parameters[2], "lemma": parameters[3]})
25
+        # {{der|it|la|alacer||lively; happy, cheerful}}
26
+        if (parameters[0] == "der") and (parameters[1] == language_code) and (parameters[2] == ancestor_language_code):
27
+            ancestor_words.append({"language": parameters[2], "lemma": parameters[3]})
28
+    return ancestor_words
... ...
@@ -1,48 +1,27 @@
1 1
 __author__ = 'Ray'
2 2
 
3
-import sys
4 3
 import io
5 4
 import re
6
-import urllib.request
7
-import urllib.parse
8
-import xml.etree.ElementTree as ET
9
-
5
+from mediawiki import get_wikipage_text
10 6
 
11 7
 def get_descendants(title):
12 8
     """Get and parse the descendant section from en.wiktionary.org for a specific title (usually word)"""
13
-    request_parameters = {
14
-        "action": "query",
15
-        "format": "xml",
16
-        "export": 1,
17
-        "exportnowrap": 1,
18
-        "titles": title,
19
-        "redirects": 1
20
-    }
21
-    request_data = urllib.parse.urlencode(request_parameters)
22
-    request_data = request_data.encode('utf8')
23
-    wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php/", request_data)
24
-    with urllib.request.urlopen(wikipage_request) as wikipage_response:
25
-        # with open("Wiktionary-20170813141826.xml", "r", encoding="utf-8") as wikipage_response:
26
-        wikipage_string = wikipage_response.read()
27
-        root = ET.fromstring(wikipage_string)
28
-        ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
29
-        # wikipage_text_string = root.find("./mwns:page/mwns:title", ns).text
30
-        wikipage_text_string = root.find("./mwns:page/mwns:revision/mwns:text", ns).text
31
-        # ET.dump(root)
32
-        # print(wikipage_text_string)
33
-        match = re.search(r"\=+Descendants\=+", wikipage_text_string)
34
-        descendants_string = wikipage_text_string[match.end():-1]
35
-        descendant_words = []
36
-        # {{l|ang|hungor}} start_tag = "{{" end_tag = "}}" parameter_tag = "|"
37
-        descendants_buffer = io.StringIO(descendants_string)
38
-        for line in descendants_buffer:
39
-            matches = re.findall(r"(?<={{).*?(?=}})",
40
-                                 line)  # find all matches for every string between {{ and }} in a non-greedy manner
41
-            for match in matches:
42
-                match_split = match.split("|")
43
-                if (match_split[0] == "l"):
44
-                    descendant_words.append({"language": match_split[1], "lemma": match_split[2]})
45
-        return descendant_words
9
+    wikipage_text_string = get_wikipage_text(title)
10
+    # ET.dump(root)
11
+    # print(wikipage_text_string)
12
+    match = re.search(r"\=+Descendants\=+", wikipage_text_string)
13
+    descendants_string = wikipage_text_string[match.end():-1]
14
+    descendant_words = []
15
+    # {{l|ang|hungor}} start_tag = "{{" end_tag = "}}" parameter_tag = "|"
16
+    descendants_buffer = io.StringIO(descendants_string)
17
+    for line in descendants_buffer:
18
+        matches = re.findall(r"(?<={{).*?(?=}})",
19
+                             line)  # find all matches for every string between {{ and }} in a non-greedy manner
20
+        for match in matches:
21
+            match_split = match.split("|")
22
+            if (match_split[0] == "l"):
23
+                descendant_words.append({"language": match_split[1], "lemma": match_split[2]})
24
+    return descendant_words
46 25
 
47 26
 
48 27
 def get_PG_decendants(word):
49 28
new file mode 100644
... ...
@@ -0,0 +1,27 @@
1
+import urllib.request
2
+import urllib.parse
3
+import xml.etree.ElementTree as ET
4
+
5
+def get_wikipage_text(title):
6
+    request_parameters = {
7
+            "action": "query",
8
+            "format": "xml",
9
+            "export": 1,
10
+            "exportnowrap": 1,
11
+            "titles": title,
12
+            "redirects": 1
13
+        }
14
+    request_data = urllib.parse.urlencode(request_parameters)
15
+    request_data = request_data.encode('utf8')
16
+    wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php", request_data)
17
+    print(wikipage_request)
18
+    with urllib.request.urlopen(wikipage_request) as wikipage_response:
19
+        # with open("Wiktionary-20170813141826.xml", "r", encoding="utf-8") as wikipage_response:
20
+        wikipage_string = wikipage_response.read()
21
+        root = ET.fromstring(wikipage_string)
22
+        ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
23
+        # wikipage_text_string = root.find("./mwns:page/mwns:title", ns).text
24
+        wikipage_text_string = root.find("./mwns:page/mwns:revision/mwns:text", ns).text
25
+        # ET.dump(root)
26
+        # print(wikipage_text_string)
27
+        return wikipage_text_string
0 28
\ No newline at end of file
... ...
@@ -74,14 +74,14 @@ def main(args):
74 74
             word2 = globalvars["data"]["words"][j]
75 75
             if (word1["language"] == word2["language"]): #if languages match
76 76
                 wordpair = (word1, word2)
77
-                compare(wordpair) #compare the pair
77
+                #compare(wordpair) #compare the pair
78 78
     # result = list(map(lambda word: {"language": word["language"], "lemma": myformat(word["lemma"])}, get_PG_decendants("hrītaną")))
79 79
     # print(str(result))
80
-    # ancestor_word = get_ancestor("allègre", "fr", "la")[0]["lemma"]
81
-    # ancestor_word = ancestor_word.lstrip("*")
82
-    # print(get_PG_decendants(ancestor_word))
83
-    print(get_ancestor("allègre", "fr", "la"))
84
-    print(get_ancestor("allegro", "it", "la"))
80
+    ancestor_word = get_ancestor("froh", "de", "gem-pro")[0]["lemma"]
81
+    ancestor_word = ancestor_word.lstrip("*")
82
+    print(get_PG_decendants(ancestor_word))
83
+    # print(get_ancestor("allègre", "fr", "la"))
84
+    # print(get_ancestor("allegro", "it", "la"))
85 85
                 
86 86
 main(sys.argv[1:])
87 87