__author__ = 'Ray'

import io  # noqa: F401 -- no longer used below; kept in case other code relies on it
import re

from mediawiki import get_wikipage_text

# Heading of the "Descendants" section at any nesting level, e.g. ===Descendants===.
_DESCENDANTS_HEADING = re.compile(r"\=+Descendants\=+")

# Everything between "{{" and "}}", non-greedy. "." does not match a newline,
# so a template body never spans more than one line.
_TEMPLATE_BODY = re.compile(r"(?<={{).*?(?=}})")


def _parse_link_templates(wikitext):
    """Return the ``{{l|<language>|<lemma>}}`` link templates found in *wikitext*.

    Each hit becomes ``{"language": ..., "lemma": ...}``, in order of
    appearance. Templates whose name is not exactly ``l``, or that carry
    fewer than two parameters (e.g. ``{{l|ang}}``), are skipped.
    """
    descendant_words = []
    for template_body in _TEMPLATE_BODY.findall(wikitext):
        fields = template_body.split("|")
        # fields[0] is the template name; a usable link needs a language
        # code and a lemma after it.
        if len(fields) >= 3 and fields[0] == "l":
            descendant_words.append({"language": fields[1], "lemma": fields[2]})
    return descendant_words


def get_descendants(title):
    """Get and parse the descendant section from en.wiktionary.org for a title.

    The page text is obtained from ``get_wikipage_text(title)`` (presumably
    the raw wikitext of the page -- confirm against ``mediawiki``). All text
    following the first "Descendants" heading, up to the end of the page, is
    scanned for ``{{l|<language>|<lemma>}}`` templates, such as
    ``{{l|ang|hungor}}``. Because the scan runs to the end of the page,
    link templates in any later section are included as well.

    :param title: page title, usually a word.
    :return: list of ``{"language": ..., "lemma": ...}`` dicts in page order;
        an empty list when the page has no "Descendants" heading.
    """
    wikipage_text_string = get_wikipage_text(title)

    heading = _DESCENDANTS_HEADING.search(wikipage_text_string)
    if heading is None:
        # No descendants section on this page: nothing to parse.
        return []

    # Slice to the very end of the text. Cutting off the last character
    # could strip the closing brace of a final "}}" and lose that template.
    return _parse_link_templates(wikipage_text_string[heading.end():])


def get_PG_decendants(word):
    """Get and parse the descendant section for a Proto-Germanic (gem-pro) word.

    Prefixes *word* with ``Reconstruction:Proto-Germanic/`` and delegates to
    :func:`get_descendants`. The misspelling in the function name is kept so
    existing callers continue to work.

    :param word: the word, without the reconstruction namespace prefix.
    :return: same as :func:`get_descendants`.
    """
    pg_prefix = "Reconstruction:Proto-Germanic/"
    return get_descendants(pg_prefix + word)