...
@@ -1,51 +1,28 @@
-import io
 import re
-import urllib.request
-import urllib.parse
-import xml.etree.ElementTree as ET
-
+from mediawiki import get_wikipage_text
 
 def get_ancestor(title, language_code, ancestor_language_code):
     """Get and parse the etymology section from en.wiktionary.org for a specific title (usually a word)"""
-    request_parameters = {
-        "action": "query",
-        "format": "xml",
-        "export": 1,
-        "exportnowrap": 1,
-        "titles": title,
-        "redirects": 1
-    }
-    request_data = urllib.parse.urlencode(request_parameters)
-    request_data = request_data.encode('utf8')
-    wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php/", request_data)
-    with urllib.request.urlopen(wikipage_request) as wikipage_response:
-        # with open("Wiktionary-20170813141826.xml", "r", encoding="utf-8") as wikipage_response:
-        wikipage_string = wikipage_response.read()
-    root = ET.fromstring(wikipage_string)
-    ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
-    # wikipage_text_string = root.find("./mwns:page/mwns:title", ns).text
-    wikipage_text_string = root.find("./mwns:page/mwns:revision/mwns:text", ns).text
-    # ET.dump(root)
-    # print(wikipage_text_string)
-    match = re.search(r"\=+Etymology.*?\=+", wikipage_text_string)
-    etymology_string = wikipage_text_string[match.end():-1]
-    ancestor_words = []
-    # {{l|ang|hungor}} start_tag = "{{" end_tag = "}}" parameter_tag = "|"
-    matches = re.findall(r"(?<={{).*?(?=}})",
-                         etymology_string)  # find all matches for every string between {{ and }} in a non-greedy manner
-    for id, match in enumerate(matches):
-        parameters = match.split("|")
-        # {{etyl|gem-pro|en}} {{m|gem-pro|*sagô}}
-        if (parameters[0] == "etyl") and (parameters[1] == ancestor_language_code) and (parameters[2] == language_code):
-            if id < len(matches):
-                next_match = matches[id + 1]
-                nm_parameters = next_match.split("|")
-                if (nm_parameters[0] == "m") and (nm_parameters[1] == ancestor_language_code):
-                    ancestor_words.append({"language": nm_parameters[1], "lemma": nm_parameters[2]})
-        # {{inh|en|gem-pro|*hwītaz}}
-        if (parameters[0] == "inh") and (parameters[1] == language_code) and (parameters[2] == ancestor_language_code):
-            ancestor_words.append({"language": parameters[2], "lemma": parameters[3]})
-        # {{der|it|la|alacer||lively; happy, cheerful}}
-        if (parameters[0] == "der") and (parameters[1] == language_code) and (parameters[2] == ancestor_language_code):
-            ancestor_words.append({"language": parameters[2], "lemma": parameters[3]})
-    return ancestor_words
+    wikipage_text_string = get_wikipage_text(title)
+    match = re.search(r"\=+Etymology.*?\=+", wikipage_text_string)
+    etymology_string = wikipage_text_string[match.end():-1]
+    ancestor_words = []
+    # {{l|ang|hungor}} start_tag = "{{" end_tag = "}}" parameter_tag = "|"
+    matches = re.findall(r"(?<={{).*?(?=}})",
+                         etymology_string)  # find all matches for every string between {{ and }} in a non-greedy manner
+    for id, match in enumerate(matches):
+        parameters = match.split("|")
+        # {{etyl|gem-pro|en}} {{m|gem-pro|*sagô}}
+        if (parameters[0] == "etyl") and (parameters[1] == ancestor_language_code) and (parameters[2] == language_code):
+            if id + 1 < len(matches):  # guard the lookahead; "id < len(matches)" was always true and let matches[id + 1] raise IndexError
+                next_match = matches[id + 1]
+                nm_parameters = next_match.split("|")
+                if (nm_parameters[0] == "m") and (nm_parameters[1] == ancestor_language_code):
+                    ancestor_words.append({"language": nm_parameters[1], "lemma": nm_parameters[2]})
+        # {{inh|en|gem-pro|*hwītaz}}
+        if (parameters[0] == "inh") and (parameters[1] == language_code) and (parameters[2] == ancestor_language_code):
+            ancestor_words.append({"language": parameters[2], "lemma": parameters[3]})
+        # {{der|it|la|alacer||lively; happy, cheerful}}
+        if (parameters[0] == "der") and (parameters[1] == language_code) and (parameters[2] == ancestor_language_code):
+            ancestor_words.append({"language": parameters[2], "lemma": parameters[3]})
+    return ancestor_words
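
A note on the parsing above: the wikitext is never parsed as markup; the code simply scans for {{...}} template spans and splits each one on "|". A minimal standalone illustration, reusing the two templates quoted in the comments (the sentence around them is invented for the demo):

    import re

    # Sample etymology wikitext; only the templates come from the comments above.
    sample = "From {{inh|en|gem-pro|*hwītaz}}; compare {{der|it|la|alacer||lively; happy, cheerful}}."
    for candidate in re.findall(r"(?<={{).*?(?=}})", sample):
        parameters = candidate.split("|")
        if parameters[0] in ("inh", "der"):
            print({"language": parameters[2], "lemma": parameters[3]})
    # -> {'language': 'gem-pro', 'lemma': '*hwītaz'}
    # -> {'language': 'la', 'lemma': 'alacer'}
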
...
@@ -1,48 +1,27 @@
 __author__ = 'Ray'
 
-import sys
 import io
 import re
-import urllib.request
-import urllib.parse
-import xml.etree.ElementTree as ET
-
+from mediawiki import get_wikipage_text
 
 def get_descendants(title):
     """Get and parse the descendant section from en.wiktionary.org for a specific title (usually word)"""
-    request_parameters = {
-        "action": "query",
-        "format": "xml",
-        "export": 1,
-        "exportnowrap": 1,
-        "titles": title,
-        "redirects": 1
-    }
-    request_data = urllib.parse.urlencode(request_parameters)
-    request_data = request_data.encode('utf8')
-    wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php/", request_data)
-    with urllib.request.urlopen(wikipage_request) as wikipage_response:
-        # with open("Wiktionary-20170813141826.xml", "r", encoding="utf-8") as wikipage_response:
-        wikipage_string = wikipage_response.read()
-    root = ET.fromstring(wikipage_string)
-    ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
-    # wikipage_text_string = root.find("./mwns:page/mwns:title", ns).text
-    wikipage_text_string = root.find("./mwns:page/mwns:revision/mwns:text", ns).text
-    # ET.dump(root)
-    # print(wikipage_text_string)
-    match = re.search(r"\=+Descendants\=+", wikipage_text_string)
-    descendants_string = wikipage_text_string[match.end():-1]
-    descendant_words = []
-    # {{l|ang|hungor}} start_tag = "{{" end_tag = "}}" parameter_tag = "|"
-    descendants_buffer = io.StringIO(descendants_string)
-    for line in descendants_buffer:
-        matches = re.findall(r"(?<={{).*?(?=}})",
-                             line)  # find all matches for every string between {{ and }} in a non-greedy manner
-        for match in matches:
-            match_split = match.split("|")
-            if (match_split[0] == "l"):
-                descendant_words.append({"language": match_split[1], "lemma": match_split[2]})
-    return descendant_words
+    wikipage_text_string = get_wikipage_text(title)
+    # ET.dump(root)
+    # print(wikipage_text_string)
+    match = re.search(r"\=+Descendants\=+", wikipage_text_string)
+    descendants_string = wikipage_text_string[match.end():-1]
+    descendant_words = []
+    # {{l|ang|hungor}} start_tag = "{{" end_tag = "}}" parameter_tag = "|"
+    descendants_buffer = io.StringIO(descendants_string)
+    for line in descendants_buffer:
+        matches = re.findall(r"(?<={{).*?(?=}})",
+                             line)  # find all matches for every string between {{ and }} in a non-greedy manner
+        for match in matches:
+            match_split = match.split("|")
+            if (match_split[0] == "l"):
+                descendant_words.append({"language": match_split[1], "lemma": match_split[2]})
+    return descendant_words
 
 
 def get_PG_decendants(word):
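
get_descendants() applies the same template scan line by line, keeping only {{l|...}} link templates. A quick sketch with a hypothetical Descendants entry built from the {{l|ang|hungor}} sample in the comments:

    import re

    # Hypothetical Descendants line; only the {{l|ang|hungor}} template is from the source.
    line = "* Old English: {{l|ang|hungor}}"
    for candidate in re.findall(r"(?<={{).*?(?=}})", line):
        match_split = candidate.split("|")
        if match_split[0] == "l":
            print({"language": match_split[1], "lemma": match_split[2]})
    # -> {'language': 'ang', 'lemma': 'hungor'}
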
new file mode 100644
...
@@ -0,0 +1,27 @@
+import urllib.request
+import urllib.parse
+import xml.etree.ElementTree as ET
+
+def get_wikipage_text(title):
+    request_parameters = {
+        "action": "query",
+        "format": "xml",
+        "export": 1,
+        "exportnowrap": 1,
+        "titles": title,
+        "redirects": 1
+    }
+    request_data = urllib.parse.urlencode(request_parameters)
+    request_data = request_data.encode('utf8')
+    wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php", request_data)
+    # print(wikipage_request)
+    with urllib.request.urlopen(wikipage_request) as wikipage_response:
+        # with open("Wiktionary-20170813141826.xml", "r", encoding="utf-8") as wikipage_response:
+        wikipage_string = wikipage_response.read()
+    root = ET.fromstring(wikipage_string)
+    ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
+    # wikipage_text_string = root.find("./mwns:page/mwns:title", ns).text
+    wikipage_text_string = root.find("./mwns:page/mwns:revision/mwns:text", ns).text
+    # ET.dump(root)
+    # print(wikipage_text_string)
+    return wikipage_text_string
\ No newline at end of file
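
For orientation: the new helper POSTs a query to the MediaWiki API with export enabled, so the response is an XML page export whose revision text node holds the raw wikitext. Roughly the GET equivalent of that request, and a minimal usage sketch ("hunger" is an arbitrary sample title):

    from mediawiki import get_wikipage_text

    # https://en.wiktionary.org/w/api.php?action=query&format=xml&export=1&exportnowrap=1&titles=hunger&redirects=1
    wikitext = get_wikipage_text("hunger")
    print(wikitext[:80])  # start of the page's raw wikitext
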
...
@@ -74,14 +74,14 @@ def main(args):
             word2 = globalvars["data"]["words"][j]
             if (word1["language"] == word2["language"]): #if languages match
                 wordpair = (word1, word2)
-                compare(wordpair) #compare the pair
+                #compare(wordpair) #compare the pair
     # result = list(map(lambda word: {"language": word["language"], "lemma": myformat(word["lemma"])}, get_PG_decendants("hrītaną")))
     # print(str(result))
-    # ancestor_word = get_ancestor("allègre", "fr", "la")[0]["lemma"]
-    # ancestor_word = ancestor_word.lstrip("*")
-    # print(get_PG_decendants(ancestor_word))
-    print(get_ancestor("allègre", "fr", "la"))
-    print(get_ancestor("allegro", "it", "la"))
+    ancestor_word = get_ancestor("froh", "de", "gem-pro")[0]["lemma"]
+    ancestor_word = ancestor_word.lstrip("*")
+    print(get_PG_decendants(ancestor_word))
+    # print(get_ancestor("allègre", "fr", "la"))
+    # print(get_ancestor("allegro", "it", "la"))
 
 main(sys.argv[1:])
 
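One caveat with the new test drive: get_ancestor() returns a possibly empty list, so the chained [0]["lemma"] raises IndexError when no matching template is found. A defensive variant of the three new lines (a sketch, not part of the patch; assumes the same scope as main(), where get_ancestor() and get_PG_decendants() are already available):

    ancestors = get_ancestor("froh", "de", "gem-pro")
    if ancestors:
        # lstrip("*") drops the reconstruction asterisk, e.g. "*hwītaz" -> "hwītaz"
        ancestor_word = ancestors[0]["lemma"].lstrip("*")
        print(get_PG_decendants(ancestor_word))
    else:
        print("no gem-pro ancestor found for 'froh'")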