refactored lookup code to avoid duplication (a08c3c7) - fs-workbench.git

refactored lookup code to avoid duplication

Raymund Zacharias committed on 2017-08-19 02:39:18
Zeige 4 geänderte Dateien mit 37 Einfügungen und 54 Löschungen.

source/tools/correlation/ancestor.py e971cde..fd4bdc4
source/tools/correlation/descendants.py 13a944b..aa73f12
source/tools/correlation/mediawiki.py 0000000..f8cfdde
source/tools/correlation/test.py c748a13..8397625

@@ -1,32 +1,9 @@
-import io
 import re
-import urllib.request
-import urllib.parse
-import xml.etree.ElementTree as ET
-
+from mediawiki import get_wikipage_text
 
 def get_ancestor(title, language_code, ancestor_language_code):
     """Get and parse the descendant section from en.wiktionary.org for a specific title (usually word)"""
-    request_parameters = {
-        "action": "query",
-        "format": "xml",
-        "export": 1,
-        "exportnowrap": 1,
-        "titles": title,
-        "redirects": 1
-    }
-    request_data = urllib.parse.urlencode(request_parameters)
-    request_data = request_data.encode('utf8')
-    wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php/", request_data)
-    with urllib.request.urlopen(wikipage_request) as wikipage_response:
-        # with open("Wiktionary-20170813141826.xml", "r", encoding="utf-8") as wikipage_response:
-        wikipage_string = wikipage_response.read()
-        root = ET.fromstring(wikipage_string)
-        ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
-        # wikipage_text_string = root.find("./mwns:page/mwns:title", ns).text
-        wikipage_text_string = root.find("./mwns:page/mwns:revision/mwns:text", ns).text
-        # ET.dump(root)
-        # print(wikipage_text_string)
+    wikipage_text_string = get_wikipage_text(title)
     match = re.search(r"\=+Etymology.*?\=+", wikipage_text_string)
     etymology_string = wikipage_text_string[match.end():-1]
     ancestor_words = []

source/tools/correlation/descendants.py

Zeige Datei @ a08c3c7

@@ -1,33 +1,12 @@
 __author__ = 'Ray'
 
-import sys
 import io
 import re
-import urllib.request
-import urllib.parse
-import xml.etree.ElementTree as ET
-
+from mediawiki import get_wikipage_text
 
 def get_descendants(title):
     """Get and parse the descendant section from en.wiktionary.org for a specific title (usually word)"""
-    request_parameters = {
-        "action": "query",
-        "format": "xml",
-        "export": 1,
-        "exportnowrap": 1,
-        "titles": title,
-        "redirects": 1
-    }
-    request_data = urllib.parse.urlencode(request_parameters)
-    request_data = request_data.encode('utf8')
-    wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php/", request_data)
-    with urllib.request.urlopen(wikipage_request) as wikipage_response:
-        # with open("Wiktionary-20170813141826.xml", "r", encoding="utf-8") as wikipage_response:
-        wikipage_string = wikipage_response.read()
-        root = ET.fromstring(wikipage_string)
-        ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
-        # wikipage_text_string = root.find("./mwns:page/mwns:title", ns).text
-        wikipage_text_string = root.find("./mwns:page/mwns:revision/mwns:text", ns).text
+    wikipage_text_string = get_wikipage_text(title)
     # ET.dump(root)
     # print(wikipage_text_string)
     match = re.search(r"\=+Descendants\=+", wikipage_text_string)

source/tools/correlation/mediawiki.py

Zeige Datei @ a08c3c7

...	...	@@ -0,0 +1,27 @@
	1	+import urllib.request
	2	+import urllib.parse
	3	+import xml.etree.ElementTree as ET
	4	+
	5	+def get_wikipage_text(title):
	6	+ request_parameters = {
	7	+ "action": "query",
	8	+ "format": "xml",
	9	+ "export": 1,
	10	+ "exportnowrap": 1,
	11	+ "titles": title,
	12	+ "redirects": 1
	13	+ }
	14	+ request_data = urllib.parse.urlencode(request_parameters)
	15	+ request_data = request_data.encode('utf8')
	16	+ wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php", request_data)
	17	+ print(wikipage_request)
	18	+ with urllib.request.urlopen(wikipage_request) as wikipage_response:
	19	+ # with open("Wiktionary-20170813141826.xml", "r", encoding="utf-8") as wikipage_response:
	20	+ wikipage_string = wikipage_response.read()
	21	+ root = ET.fromstring(wikipage_string)
	22	+ ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
	23	+ # wikipage_text_string = root.find("./mwns:page/mwns:title", ns).text
	24	+ wikipage_text_string = root.find("./mwns:page/mwns:revision/mwns:text", ns).text
	25	+ # ET.dump(root)
	26	+ # print(wikipage_text_string)
	27	+ return wikipage_text_string
0	28	\ No newline at end of file

source/tools/correlation/test.py

Zeige Datei @ a08c3c7

@@ -74,14 +74,14 @@ def main(args):
             word2 = globalvars["data"]["words"][j]
             if (word1["language"] == word2["language"]): #if languages match
                 wordpair = (word1, word2)
-                compare(wordpair) #compare the pair
+                #compare(wordpair) #compare the pair
     # result = list(map(lambda word: {"language": word["language"], "lemma": myformat(word["lemma"])}, get_PG_decendants("hrītaną")))
     # print(str(result))
-    # ancestor_word = get_ancestor("allègre", "fr", "la")[0]["lemma"]
-    # ancestor_word = ancestor_word.lstrip("*")
-    # print(get_PG_decendants(ancestor_word))
-    print(get_ancestor("allègre", "fr", "la"))
-    print(get_ancestor("allegro", "it", "la"))
+    ancestor_word = get_ancestor("froh", "de", "gem-pro")[0]["lemma"]
+    ancestor_word = ancestor_word.lstrip("*")
+    print(get_PG_decendants(ancestor_word))
+    # print(get_ancestor("allègre", "fr", "la"))
+    # print(get_ancestor("allegro", "it", "la"))
                 
 main(sys.argv[1:])
 


...	...	@@ -1,32 +1,9 @@
1		-import io
2	1	import re
3		-import urllib.request
4		-import urllib.parse
5		-import xml.etree.ElementTree as ET
6		-
	2	+from mediawiki import get_wikipage_text
7	3
8	4	def get_ancestor(title, language_code, ancestor_language_code):
9	5	"""Get and parse the descendant section from en.wiktionary.org for a specific title (usually word)"""
10		- request_parameters = {
11		- "action": "query",
12		- "format": "xml",
13		- "export": 1,
14		- "exportnowrap": 1,
15		- "titles": title,
16		- "redirects": 1
17		- }
18		- request_data = urllib.parse.urlencode(request_parameters)
19		- request_data = request_data.encode('utf8')
20		- wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php/", request_data)
21		- with urllib.request.urlopen(wikipage_request) as wikipage_response:
22		- # with open("Wiktionary-20170813141826.xml", "r", encoding="utf-8") as wikipage_response:
23		- wikipage_string = wikipage_response.read()
24		- root = ET.fromstring(wikipage_string)
25		- ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
26		- # wikipage_text_string = root.find("./mwns:page/mwns:title", ns).text
27		- wikipage_text_string = root.find("./mwns:page/mwns:revision/mwns:text", ns).text
28		- # ET.dump(root)
29		- # print(wikipage_text_string)
	6	+ wikipage_text_string = get_wikipage_text(title)
30	7	match = re.search(r"\=+Etymology.*?\=+", wikipage_text_string)
31	8	etymology_string = wikipage_text_string[match.end():-1]
32	9	ancestor_words = []

...	...	@@ -1,33 +1,12 @@
1	1	__author__ = 'Ray'
2	2
3		-import sys
4	3	import io
5	4	import re
6		-import urllib.request
7		-import urllib.parse
8		-import xml.etree.ElementTree as ET
9		-
	5	+from mediawiki import get_wikipage_text
10	6
11	7	def get_descendants(title):
12	8	"""Get and parse the descendant section from en.wiktionary.org for a specific title (usually word)"""
13		- request_parameters = {
14		- "action": "query",
15		- "format": "xml",
16		- "export": 1,
17		- "exportnowrap": 1,
18		- "titles": title,
19		- "redirects": 1
20		- }
21		- request_data = urllib.parse.urlencode(request_parameters)
22		- request_data = request_data.encode('utf8')
23		- wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php/", request_data)
24		- with urllib.request.urlopen(wikipage_request) as wikipage_response:
25		- # with open("Wiktionary-20170813141826.xml", "r", encoding="utf-8") as wikipage_response:
26		- wikipage_string = wikipage_response.read()
27		- root = ET.fromstring(wikipage_string)
28		- ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
29		- # wikipage_text_string = root.find("./mwns:page/mwns:title", ns).text
30		- wikipage_text_string = root.find("./mwns:page/mwns:revision/mwns:text", ns).text
	9	+ wikipage_text_string = get_wikipage_text(title)
31	10	# ET.dump(root)
32	11	# print(wikipage_text_string)
33	12	match = re.search(r"\=+Descendants\=+", wikipage_text_string)

...	...	@@ -74,14 +74,14 @@ def main(args):
74	74	word2 = globalvars["data"]["words"][j]
75	75	if (word1["language"] == word2["language"]): #if languages match
76	76	wordpair = (word1, word2)
77		- compare(wordpair) #compare the pair
	77	+ #compare(wordpair) #compare the pair
78	78	# result = list(map(lambda word: {"language": word["language"], "lemma": myformat(word["lemma"])}, get_PG_decendants("hrītaną")))
79	79	# print(str(result))
80		- # ancestor_word = get_ancestor("allègre", "fr", "la")[0]["lemma"]
81		- # ancestor_word = ancestor_word.lstrip("*")
82		- # print(get_PG_decendants(ancestor_word))
83		- print(get_ancestor("allègre", "fr", "la"))
84		- print(get_ancestor("allegro", "it", "la"))
	80	+ ancestor_word = get_ancestor("froh", "de", "gem-pro")[0]["lemma"]
	81	+ ancestor_word = ancestor_word.lstrip("*")
	82	+ print(get_PG_decendants(ancestor_word))
	83	+ # print(get_ancestor("allègre", "fr", "la"))
	84	+ # print(get_ancestor("allegro", "it", "la"))
85	85
86	86	main(sys.argv[1:])
87	87
88	88