new line type adjusted(?)
Raymund Zacharias authored 6 years ago
|
1) import io
2) import re
3) import urllib.request
4) import urllib.parse
5) import xml.etree.ElementTree as ET
6)
7)
8) def get_ancestor(title, language_code, ancestor_language_code):
9) """Get and parse the descendant section from en.wiktionary.org for a specific title (usually word)"""
10) request_parameters = {
11) "action": "query",
12) "format": "xml",
13) "export": 1,
14) "exportnowrap": 1,
15) "titles": title,
16) "redirects": 1
17) }
18) request_data = urllib.parse.urlencode(request_parameters)
19) request_data = request_data.encode('utf8')
20) wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php/", request_data)
21) with urllib.request.urlopen(wikipage_request) as wikipage_response:
22) # with open("Wiktionary-20170813141826.xml", "r", encoding="utf-8") as wikipage_response:
23) wikipage_string = wikipage_response.read()
24) root = ET.fromstring(wikipage_string)
25) ns = {"mwns": "http://www.mediawiki.org/xml/export-0.10/"}
26) # wikipage_text_string = root.find("./mwns:page/mwns:title", ns).text
27) wikipage_text_string = root.find("./mwns:page/mwns:revision/mwns:text", ns).text
28) # ET.dump(root)
29) # print(wikipage_text_string)
30) match = re.search(r"\=+Etymology.*?\=+", wikipage_text_string)
31) etymology_string = wikipage_text_string[match.end():-1]
32) ancestor_words = []
33) # {{l|ang|hungor}} start_tag = "{{" end_tag = "}}" parameter_tag = "|"
34) matches = re.findall(r"(?<={{).*?(?=}})",
35) etymology_string) # find all matches for every string between {{ and }} in a non-greedy manner
36) for id, match in enumerate(matches):
37) parameters = match.split("|")
38) # {{etyl|gem-pro|en}} {{m|gem-pro|*sagô}}
39) if (parameters[0] == "etyl") and (parameters[1] == ancestor_language_code) and (parameters[2] == language_code):
40) if id < len(matches):
41) next_match = matches[id + 1]
42) nm_parameters = next_match.split("|")
43) if (nm_parameters[0] == "m") and (nm_parameters[1] == ancestor_language_code):
44) ancestor_words.append({"language": nm_parameters[1], "lemma": nm_parameters[2]})
45) # {{inh|en|gem-pro|*hwītaz}}
46) if (parameters[0] == "inh") and (parameters[1] == language_code) and (parameters[2] == ancestor_language_code):
47) ancestor_words.append({"language": parameters[2], "lemma": parameters[3]})
|
added support of another te...
Raymund Zacharias authored 6 years ago
|
48) # {{der|it|la|alacer||lively; happy, cheerful}}
49) if (parameters[0] == "der") and (parameters[1] == language_code) and (parameters[2] == ancestor_language_code):
50) ancestor_words.append({"language": parameters[2], "lemma": parameters[3]})
|