Raymund Zacharias committed on 2017-08-19 02:39:18
Zeige 4 geänderte Dateien mit 37 Einfügungen und 54 Löschungen.
| ... | ... |
@@ -1,32 +1,9 @@ |
| 1 |
-import io |
|
| 2 | 1 |
import re |
| 3 |
-import urllib.request |
|
| 4 |
-import urllib.parse |
|
| 5 |
-import xml.etree.ElementTree as ET |
|
| 6 |
- |
|
| 2 |
+from mediawiki import get_wikipage_text |
|
| 7 | 3 |
|
| 8 | 4 |
def get_ancestor(title, language_code, ancestor_language_code): |
| 9 | 5 |
"""Get and parse the descendant section from en.wiktionary.org for a specific title (usually word)""" |
| 10 |
- request_parameters = {
|
|
| 11 |
- "action": "query", |
|
| 12 |
- "format": "xml", |
|
| 13 |
- "export": 1, |
|
| 14 |
- "exportnowrap": 1, |
|
| 15 |
- "titles": title, |
|
| 16 |
- "redirects": 1 |
|
| 17 |
- } |
|
| 18 |
- request_data = urllib.parse.urlencode(request_parameters) |
|
| 19 |
- request_data = request_data.encode('utf8')
|
|
| 20 |
- wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php/", request_data)
|
|
| 21 |
- with urllib.request.urlopen(wikipage_request) as wikipage_response: |
|
| 22 |
- # with open("Wiktionary-20170813141826.xml", "r", encoding="utf-8") as wikipage_response:
|
|
| 23 |
- wikipage_string = wikipage_response.read() |
|
| 24 |
- root = ET.fromstring(wikipage_string) |
|
| 25 |
- ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
|
|
| 26 |
- # wikipage_text_string = root.find("./mwns:page/mwns:title", ns).text
|
|
| 27 |
- wikipage_text_string = root.find("./mwns:page/mwns:revision/mwns:text", ns).text
|
|
| 28 |
- # ET.dump(root) |
|
| 29 |
- # print(wikipage_text_string) |
|
| 6 |
+ wikipage_text_string = get_wikipage_text(title) |
|
| 30 | 7 |
match = re.search(r"\=+Etymology.*?\=+", wikipage_text_string) |
| 31 | 8 |
etymology_string = wikipage_text_string[match.end():-1] |
| 32 | 9 |
ancestor_words = [] |
| ... | ... |
@@ -1,33 +1,12 @@ |
| 1 | 1 |
__author__ = 'Ray' |
| 2 | 2 |
|
| 3 |
-import sys |
|
| 4 | 3 |
import io |
| 5 | 4 |
import re |
| 6 |
-import urllib.request |
|
| 7 |
-import urllib.parse |
|
| 8 |
-import xml.etree.ElementTree as ET |
|
| 9 |
- |
|
| 5 |
+from mediawiki import get_wikipage_text |
|
| 10 | 6 |
|
| 11 | 7 |
def get_descendants(title): |
| 12 | 8 |
"""Get and parse the descendant section from en.wiktionary.org for a specific title (usually word)""" |
| 13 |
- request_parameters = {
|
|
| 14 |
- "action": "query", |
|
| 15 |
- "format": "xml", |
|
| 16 |
- "export": 1, |
|
| 17 |
- "exportnowrap": 1, |
|
| 18 |
- "titles": title, |
|
| 19 |
- "redirects": 1 |
|
| 20 |
- } |
|
| 21 |
- request_data = urllib.parse.urlencode(request_parameters) |
|
| 22 |
- request_data = request_data.encode('utf8')
|
|
| 23 |
- wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php/", request_data)
|
|
| 24 |
- with urllib.request.urlopen(wikipage_request) as wikipage_response: |
|
| 25 |
- # with open("Wiktionary-20170813141826.xml", "r", encoding="utf-8") as wikipage_response:
|
|
| 26 |
- wikipage_string = wikipage_response.read() |
|
| 27 |
- root = ET.fromstring(wikipage_string) |
|
| 28 |
- ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
|
|
| 29 |
- # wikipage_text_string = root.find("./mwns:page/mwns:title", ns).text
|
|
| 30 |
- wikipage_text_string = root.find("./mwns:page/mwns:revision/mwns:text", ns).text
|
|
| 9 |
+ wikipage_text_string = get_wikipage_text(title) |
|
| 31 | 10 |
# ET.dump(root) |
| 32 | 11 |
# print(wikipage_text_string) |
| 33 | 12 |
match = re.search(r"\=+Descendants\=+", wikipage_text_string) |
| ... | ... |
import urllib.request
import urllib.parse
import xml.etree.ElementTree as ET


def get_wikipage_text(title):
    """Fetch the raw wikitext of *title* from en.wiktionary.org.

    Queries the MediaWiki API with ``action=query&export&exportnowrap`` so
    the response is a bare XML page export, then returns the text of the
    latest revision as a string.

    :param title: page title (usually a word) to look up
    :return: wikitext of the page's latest revision
    :raises urllib.error.URLError: on network failure
    """
    request_parameters = {
        "action": "query",
        "format": "xml",
        "export": 1,
        "exportnowrap": 1,
        "titles": title,
        # Follow redirects so alternate spellings resolve to the real page.
        "redirects": 1,
    }
    request_data = urllib.parse.urlencode(request_parameters).encode("utf8")
    # POST keeps titles with non-ASCII characters out of the URL path.
    wikipage_request = urllib.request.Request(
        "https://en.wiktionary.org/w/api.php", request_data
    )
    with urllib.request.urlopen(wikipage_request) as wikipage_response:
        wikipage_string = wikipage_response.read()
    root = ET.fromstring(wikipage_string)
    # The export XML is namespaced; map the MediaWiki export-0.10 namespace
    # so the find() path below resolves.
    ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
    wikipage_text_string = root.find(
        "./mwns:page/mwns:revision/mwns:text", ns
    ).text
    return wikipage_text_string
| ... | ... |
@@ -74,14 +74,14 @@ def main(args): |
| 74 | 74 |
word2 = globalvars["data"]["words"][j] |
| 75 | 75 |
if (word1["language"] == word2["language"]): #if languages match |
| 76 | 76 |
wordpair = (word1, word2) |
| 77 |
- compare(wordpair) #compare the pair |
|
| 77 |
+ #compare(wordpair) #compare the pair |
|
| 78 | 78 |
# result = list(map(lambda word: {"language": word["language"], "lemma": myformat(word["lemma"])}, get_PG_decendants("hrītaną")))
|
| 79 | 79 |
# print(str(result)) |
| 80 |
- # ancestor_word = get_ancestor("allègre", "fr", "la")[0]["lemma"]
|
|
| 81 |
- # ancestor_word = ancestor_word.lstrip("*")
|
|
| 82 |
- # print(get_PG_decendants(ancestor_word)) |
|
| 83 |
- print(get_ancestor("allègre", "fr", "la"))
|
|
| 84 |
- print(get_ancestor("allegro", "it", "la"))
|
|
| 80 |
+ ancestor_word = get_ancestor("froh", "de", "gem-pro")[0]["lemma"]
|
|
| 81 |
+ ancestor_word = ancestor_word.lstrip("*")
|
|
| 82 |
+ print(get_PG_decendants(ancestor_word)) |
|
| 83 |
+ # print(get_ancestor("allègre", "fr", "la"))
|
|
| 84 |
+ # print(get_ancestor("allegro", "it", "la"))
|
|
| 85 | 85 |
|
| 86 | 86 |
main(sys.argv[1:]) |
| 87 | 87 |
|
| 88 | 88 |