Raymund Zacharias committed on 2017-08-18 05:37:37
Zeige 2 geänderte Dateien mit 52 Einfügungen und 0 Löschungen.
| ... | ... |
@@ -0,0 +1,48 @@ |
| 1 |
+import io |
|
| 2 |
+import re |
|
| 3 |
+import urllib.request |
|
| 4 |
+import urllib.parse |
|
| 5 |
+import xml.etree.ElementTree as ET |
|
| 6 |
+ |
|
| 7 |
+ |
|
| 8 |
_WIKTIONARY_API_URL = "https://en.wiktionary.org/w/api.php/"
# Matches a section heading such as "===Etymology===" or "===Etymology 1===".
_ETYMOLOGY_HEADING = re.compile(r"\=+Etymology.*?\=+")
# Captures the text between "{{" and "}}" (non-greedy, single line only).
_TEMPLATE_BODY = re.compile(r"(?<={{).*?(?=}})")


def _fetch_wikitext(title):
    """Download the wikitext of ``title`` from en.wiktionary.org.

    Uses the MediaWiki ``action=query`` export (``exportnowrap``), following
    redirects. Returns the page text as a string, or ``None`` when the export
    contains no page/revision text (e.g. the title does not exist).
    """
    request_parameters = {
        "action": "query",
        "format": "xml",
        "export": 1,
        "exportnowrap": 1,
        "titles": title,
        "redirects": 1,
    }
    request_data = urllib.parse.urlencode(request_parameters).encode("utf8")
    wikipage_request = urllib.request.Request(_WIKTIONARY_API_URL, request_data)
    with urllib.request.urlopen(wikipage_request) as wikipage_response:
        root = ET.fromstring(wikipage_response.read())

    # Take the XML namespace from the root element instead of hard-coding one
    # export schema version, so a schema bump does not break the lookup.
    prefix = ""
    if root.tag.startswith("{"):
        prefix = "{" + root.tag[1:].partition("}")[0] + "}"
    text_element = root.find(
        "./{p}page/{p}revision/{p}text".format(p=prefix)
    )
    if text_element is None:
        return None
    return text_element.text


def get_ancestor(title, language_code, ancestor_language_code, wikitext=None):
    """Return the ancestors of a word named in its Wiktionary etymology.

    The wikitext following the first "Etymology" heading is scanned for
    templates, and two patterns are recognised:

    * ``{{etyl|<ancestor>|<language>}}`` immediately followed by
      ``{{m|<ancestor>|<lemma>}}``
    * ``{{inh|<language>|<ancestor>|<lemma>}}``

    Args:
        title: Page title (usually the word) to look up on en.wiktionary.org.
        language_code: Wiktionary language code of the word, e.g. ``"en"``.
        ancestor_language_code: Language code of the wanted ancestor,
            e.g. ``"gem-pro"``.
        wikitext: Optional, already-fetched page wikitext. When given, no
            network request is made and ``title`` is not used.

    Returns:
        A list of ``{"language": ..., "lemma": ...}`` dicts in page order.
        The list is empty when the page has no text, no Etymology heading,
        or no matching template.
    """
    if wikitext is None:
        wikitext = _fetch_wikitext(title)
    if not wikitext:
        return []

    heading = _ETYMOLOGY_HEADING.search(wikitext)
    if heading is None:
        return []

    # Everything after the heading is scanned, up to and including the final
    # character so a template that closes the page is still matched.
    templates = _TEMPLATE_BODY.findall(wikitext[heading.end():])

    ancestor_words = []
    for index, template in enumerate(templates):
        parameters = template.split("|")
        template_name = parameters[0]

        # {{etyl|gem-pro|en}} {{m|gem-pro|*sagô}}
        if (template_name == "etyl"
                and parameters[1:3] == [ancestor_language_code, language_code]):
            # The lemma lives in the *next* template; there may be none.
            if index + 1 < len(templates):
                nm_parameters = templates[index + 1].split("|")
                if (len(nm_parameters) >= 3
                        and nm_parameters[0] == "m"
                        and nm_parameters[1] == ancestor_language_code):
                    ancestor_words.append(
                        {"language": nm_parameters[1], "lemma": nm_parameters[2]}
                    )
        # {{inh|en|gem-pro|*hwītaz}}
        elif (template_name == "inh"
                and len(parameters) >= 4
                and parameters[1:3] == [language_code, ancestor_language_code]):
            ancestor_words.append(
                {"language": parameters[2], "lemma": parameters[3]}
            )
    return ancestor_words
| ... | ... |
@@ -9,6 +9,7 @@ import urllib.request |
| 9 | 9 |
import urllib.parse |
| 10 | 10 |
import xml.etree.ElementTree as ET |
| 11 | 11 |
from descendants import get_PG_decendants, get_descendants |
| 12 |
+from ancestor import get_ancestor |
|
| 12 | 13 |
|
| 13 | 14 |
globalvars = {
|
| 14 | 15 |
"escape": False, |
| ... | ... |
@@ -76,6 +77,9 @@ def main(args): |
| 76 | 77 |
compare(wordpair) #compare the pair |
| 77 | 78 |
result = list(map(lambda word: {"language": word["language"], "lemma": myformat(word["lemma"])}, get_PG_decendants("hrītaną")))
|
| 78 | 79 |
print(str(result)) |
| 80 |
+ ancestor_word = get_ancestor("white", "en", "gem-pro")[0]["lemma"]
|
|
| 81 |
+ ancestor_word = ancestor_word.lstrip("*")
|
|
| 82 |
+ print(get_PG_decendants(ancestor_word)) |
|
| 79 | 83 |
|
| 80 | 84 |
main(sys.argv[1:]) |
| 81 | 85 |
|
| 82 | 86 |