Raymund Zacharias committed on 2017-08-19 02:39:18
Showing 4 changed files with 37 insertions and 54 deletions.
@@ -1,32 +1,9 @@
-import io
 import re
-import urllib.request
-import urllib.parse
-import xml.etree.ElementTree as ET
-
+from mediawiki import get_wikipage_text
 
 def get_ancestor(title, language_code, ancestor_language_code):
     """Get and parse the descendant section from en.wiktionary.org for a specific title (usually word)"""
-    request_parameters = {
-        "action": "query",
-        "format": "xml",
-        "export": 1,
-        "exportnowrap": 1,
-        "titles": title,
-        "redirects": 1
-    }
-    request_data = urllib.parse.urlencode(request_parameters)
-    request_data = request_data.encode('utf8')
-    wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php/", request_data)
-    with urllib.request.urlopen(wikipage_request) as wikipage_response:
-    # with open("Wiktionary-20170813141826.xml", "r", encoding="utf-8") as wikipage_response:
-        wikipage_string = wikipage_response.read()
-    root = ET.fromstring(wikipage_string)
-    ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
-    # wikipage_text_string = root.find("./mwns:page/mwns:title", ns).text
-    wikipage_text_string = root.find("./mwns:page/mwns:revision/mwns:text", ns).text
-    # ET.dump(root)
-    # print(wikipage_text_string)
+    wikipage_text_string = get_wikipage_text(title)
     match = re.search(r"\=+Etymology.*?\=+", wikipage_text_string)
     etymology_string = wikipage_text_string[match.end():-1]
     ancestor_words = []
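Note: with the request and XML handling moved into mediawiki.get_wikipage_text, get_ancestor now only slices the Etymology section out of the wikitext. A minimal usage sketch, assuming the return shape implied by main below (a list of dicts with at least a "lemma" key); the importing module's file name is not shown in the diff:

    # Hypothetical import path; the diff does not name this file.
    from ancestor import get_ancestor

    # Latin ancestor(s) of French "allègre", as exercised in main below.
    ancestors = get_ancestor("allègre", "fr", "la")
    print(ancestors[0]["lemma"])  # reconstructed lemmas carry a leading "*"
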
@@ -1,33 +1,12 @@
 __author__ = 'Ray'
 
-import sys
 import io
 import re
-import urllib.request
-import urllib.parse
-import xml.etree.ElementTree as ET
-
+from mediawiki import get_wikipage_text
 
 def get_descendants(title):
     """Get and parse the descendant section from en.wiktionary.org for a specific title (usually word)"""
-    request_parameters = {
-        "action": "query",
-        "format": "xml",
-        "export": 1,
-        "exportnowrap": 1,
-        "titles": title,
-        "redirects": 1
-    }
-    request_data = urllib.parse.urlencode(request_parameters)
-    request_data = request_data.encode('utf8')
-    wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php/", request_data)
-    with urllib.request.urlopen(wikipage_request) as wikipage_response:
-    # with open("Wiktionary-20170813141826.xml", "r", encoding="utf-8") as wikipage_response:
-        wikipage_string = wikipage_response.read()
-    root = ET.fromstring(wikipage_string)
-    ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
-    # wikipage_text_string = root.find("./mwns:page/mwns:title", ns).text
-    wikipage_text_string = root.find("./mwns:page/mwns:revision/mwns:text", ns).text
+    wikipage_text_string = get_wikipage_text(title)
     # ET.dump(root)
     # print(wikipage_text_string)
     match = re.search(r"\=+Descendants\=+", wikipage_text_string)
... | ... |
@@ -0,0 +1,27 @@ |
1 |
+import urllib.request |
|
2 |
+import urllib.parse |
|
3 |
+import xml.etree.ElementTree as ET |
|
4 |
+ |
|
5 |
+def get_wikipage_text(title): |
|
6 |
+ request_parameters = { |
|
7 |
+ "action": "query", |
|
8 |
+ "format": "xml", |
|
9 |
+ "export": 1, |
|
10 |
+ "exportnowrap": 1, |
|
11 |
+ "titles": title, |
|
12 |
+ "redirects": 1 |
|
13 |
+ } |
|
14 |
+ request_data = urllib.parse.urlencode(request_parameters) |
|
15 |
+ request_data = request_data.encode('utf8') |
|
16 |
+ wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php", request_data) |
|
17 |
+ print(wikipage_request) |
|
18 |
+ with urllib.request.urlopen(wikipage_request) as wikipage_response: |
|
19 |
+ # with open("Wiktionary-20170813141826.xml", "r", encoding="utf-8") as wikipage_response: |
|
20 |
+ wikipage_string = wikipage_response.read() |
|
21 |
+ root = ET.fromstring(wikipage_string) |
|
22 |
+ ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"} |
|
23 |
+ # wikipage_text_string = root.find("./mwns:page/mwns:title", ns).text |
|
24 |
+ wikipage_text_string = root.find("./mwns:page/mwns:revision/mwns:text", ns).text |
|
25 |
+ # ET.dump(root) |
|
26 |
+ # print(wikipage_text_string) |
|
27 |
+ return wikipage_text_string |
|
0 | 28 |
\ No newline at end of file |
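The new mediawiki module wraps the en.wiktionary.org API call: it posts a query/export request, parses the returned XML export, and returns the raw wikitext of the page's latest revision. A minimal usage sketch (the page title is illustrative; network access and an existing page are assumed):

    from mediawiki import get_wikipage_text

    wikitext = get_wikipage_text("Wasser")  # any existing page title
    print(wikitext[:200])                   # start of the raw wikitext

Note that print(wikipage_request) is a leftover debug statement: it prints the urllib Request object itself, not the request payload.
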
@@ -74,14 +74,14 @@ def main(args):
             word2 = globalvars["data"]["words"][j]
             if (word1["language"] == word2["language"]): #if languages match
                 wordpair = (word1, word2)
-                compare(wordpair) #compare the pair
+                #compare(wordpair) #compare the pair
     # result = list(map(lambda word: {"language": word["language"], "lemma": myformat(word["lemma"])}, get_PG_decendants("hrītaną")))
     # print(str(result))
-    # ancestor_word = get_ancestor("allègre", "fr", "la")[0]["lemma"]
-    # ancestor_word = ancestor_word.lstrip("*")
-    # print(get_PG_decendants(ancestor_word))
-    print(get_ancestor("allègre", "fr", "la"))
-    print(get_ancestor("allegro", "it", "la"))
+    ancestor_word = get_ancestor("froh", "de", "gem-pro")[0]["lemma"]
+    ancestor_word = ancestor_word.lstrip("*")
+    print(get_PG_decendants(ancestor_word))
+    # print(get_ancestor("allègre", "fr", "la"))
+    # print(get_ancestor("allegro", "it", "la"))
 
 main(sys.argv[1:])
 
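Wiktionary writes reconstructed Proto-Germanic lemmas with a leading asterisk, which is why main strips it before handing the lemma to get_PG_decendants. A one-line illustration (the reconstructed form is illustrative only):

    print("*frawaz".lstrip("*"))  # -> frawaz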