git.schokokeks.org
Repositories
Help
Report an Issue
fs-workbench.git
Code
Commits
Branches
Tags
Suche
Strukturansicht:
9d1aeac
Branches
Tags
master
fs-workbench.git
source
tools
correlation
descendants.py
separated the descendants acquisition logic from test.py and implemented actual fetching of wiktionary data
Raymund Zacharias
committed
9d1aeac
at 2017-08-14 10:59:04
descendants.py
Blame
History
Raw
"""Fetch and parse the "Descendants" section of en.wiktionary.org entries."""

__author__ = 'Ray'

import sys
import io
import re
import urllib.request
import urllib.parse
import xml.etree.ElementTree as ET

_API_URL = "https://en.wiktionary.org/w/api.php/"

# Matches the "Descendants" heading at any nesting level; group 1 holds the
# leading "=" run, whose length is the heading level.
_DESCENDANTS_HEADING_RE = re.compile(r"(=+)[ \t]*Descendants[ \t]*=+")

# Matches any wikitext heading that occupies a whole line; group 1 holds the
# leading "=" run.
_HEADING_RE = re.compile(r"^(=+)[^=\n][^\n]*?=+[ \t\r]*$", re.MULTILINE)

# Matches the content between "{{" and "}}" non-greedily. "." does not match
# a newline, so a template has to open and close on the same line.
_TEMPLATE_RE = re.compile(r"\{\{(.*?)\}\}")


def parse_descendants(wikitext):
    """Extract the linked descendants from the wikitext of one page.

    Only the first "Descendants" section is read.  The section runs from its
    heading to the next heading of the same or a higher level (or to the end
    of the text), so deeper sub-headings stay part of it while links in later
    sections are ignored.

    Within the section every ``{{l|<language>|<lemma>|...}}`` template, e.g.
    ``{{l|ang|hungor}}``, yields one entry.  Other templates, and ``l``
    templates with fewer than two arguments, are skipped.

    :param wikitext: raw wikitext of a page; may be empty or ``None``.
    :return: list of ``{"language": ..., "lemma": ...}`` dicts in order of
        appearance; empty if there is no "Descendants" section.
    """
    if not wikitext:
        return []

    heading = _DESCENDANTS_HEADING_RE.search(wikitext)
    if heading is None:
        return []

    level = len(heading.group(1))
    # Slice to the very end of the text: cutting the last character off would
    # destroy a template that closes the page.
    section = wikitext[heading.end():]
    for following in _HEADING_RE.finditer(section):
        if len(following.group(1)) <= level:
            section = section[:following.start()]
            break

    descendant_words = []
    for template in _TEMPLATE_RE.findall(section):
        fields = template.split("|")
        if len(fields) >= 3 and fields[0] == "l":
            descendant_words.append(
                {"language": fields[1], "lemma": fields[2]}
            )
    return descendant_words


def _extract_wikitext(export_xml):
    """Return the revision text from a MediaWiki XML export ('' if absent)."""
    root = ET.fromstring(export_xml)
    if root.tag.startswith("{"):
        # Take the namespace from the document itself instead of pinning one
        # export schema version.
        namespace = {"mwns": root.tag[1:].split("}", 1)[0]}
        text_element = root.find(
            "./mwns:page/mwns:revision/mwns:text", namespace
        )
    else:
        text_element = root.find("./page/revision/text")
    if text_element is None or text_element.text is None:
        return ""
    return text_element.text


def _fetch_wikitext(title):
    """Download the current wikitext of *title* from en.wiktionary.org."""
    request_parameters = {
        "action": "query",
        "format": "xml",
        "export": 1,
        "exportnowrap": 1,
        "titles": title,
        "redirects": 1,
    }
    request_data = urllib.parse.urlencode(request_parameters).encode('utf8')
    # NOTE(review): no User-Agent header is set, so urllib's default is sent;
    # confirm that this is acceptable for the Wikimedia API.
    wikipage_request = urllib.request.Request(_API_URL, request_data)
    with urllib.request.urlopen(wikipage_request) as wikipage_response:
        return _extract_wikitext(wikipage_response.read())


def get_descendants(title):
    """Get and parse the descendant section from en.wiktionary.org.

    :param title: page title to look up (usually a word).
    :return: list of ``{"language": ..., "lemma": ...}`` dicts, one per
        ``{{l|...}}`` link in the page's "Descendants" section; empty if the
        page has no text or no such section.
    :raises urllib.error.URLError: if the HTTP request fails.
    :raises xml.etree.ElementTree.ParseError: if the response is not
        well-formed XML.
    """
    return parse_descendants(_fetch_wikitext(title))


def get_PG_decendants(word):
    """Get and parse the descendant section for a specific gem-pro word.

    The word is looked up under the ``Reconstruction:Proto-Germanic/`` title
    prefix.  The spelling of the function name is kept as is so that existing
    callers keep working.

    :param word: the Proto-Germanic word, without the title prefix.
    :return: same as :func:`get_descendants`.
    """
    pg_prefix = "Reconstruction:Proto-Germanic/"
    return get_descendants(pg_prefix + word)