Raymund Zacharias committed on 2017-08-14 10:59:04
Zeige 3 geänderte Dateien mit 257 Einfügungen und 0 Löschungen.
... | ... |
@@ -0,0 +1,125 @@ |
1 |
+<?xml version="1.0"?> |
|
2 |
+<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="en"> |
|
3 |
+ <siteinfo> |
|
4 |
+ <sitename>Wiktionary</sitename> |
|
5 |
+ <dbname>enwiktionary</dbname> |
|
6 |
+ <base>https://en.wiktionary.org/wiki/Wiktionary:Main_Page</base> |
|
7 |
+ <generator>MediaWiki 1.30.0-wmf.13</generator> |
|
8 |
+ <case>case-sensitive</case> |
|
9 |
+ <namespaces> |
|
10 |
+ <namespace key="-2" case="case-sensitive">Media</namespace> |
|
11 |
+ <namespace key="-1" case="first-letter">Special</namespace> |
|
12 |
+ <namespace key="0" case="case-sensitive" /> |
|
13 |
+ <namespace key="1" case="case-sensitive">Talk</namespace> |
|
14 |
+ <namespace key="2" case="first-letter">User</namespace> |
|
15 |
+ <namespace key="3" case="first-letter">User talk</namespace> |
|
16 |
+ <namespace key="4" case="case-sensitive">Wiktionary</namespace> |
|
17 |
+ <namespace key="5" case="case-sensitive">Wiktionary talk</namespace> |
|
18 |
+ <namespace key="6" case="case-sensitive">File</namespace> |
|
19 |
+ <namespace key="7" case="case-sensitive">File talk</namespace> |
|
20 |
+ <namespace key="8" case="first-letter">MediaWiki</namespace> |
|
21 |
+ <namespace key="9" case="first-letter">MediaWiki talk</namespace> |
|
22 |
+ <namespace key="10" case="case-sensitive">Template</namespace> |
|
23 |
+ <namespace key="11" case="case-sensitive">Template talk</namespace> |
|
24 |
+ <namespace key="12" case="case-sensitive">Help</namespace> |
|
25 |
+ <namespace key="13" case="case-sensitive">Help talk</namespace> |
|
26 |
+ <namespace key="14" case="case-sensitive">Category</namespace> |
|
27 |
+ <namespace key="15" case="case-sensitive">Category talk</namespace> |
|
28 |
+ <namespace key="90" case="case-sensitive">Thread</namespace> |
|
29 |
+ <namespace key="91" case="case-sensitive">Thread talk</namespace> |
|
30 |
+ <namespace key="92" case="case-sensitive">Summary</namespace> |
|
31 |
+ <namespace key="93" case="case-sensitive">Summary talk</namespace> |
|
32 |
+ <namespace key="100" case="case-sensitive">Appendix</namespace> |
|
33 |
+ <namespace key="101" case="case-sensitive">Appendix talk</namespace> |
|
34 |
+ <namespace key="102" case="case-sensitive">Concordance</namespace> |
|
35 |
+ <namespace key="103" case="case-sensitive">Concordance talk</namespace> |
|
36 |
+ <namespace key="104" case="case-sensitive">Index</namespace> |
|
37 |
+ <namespace key="105" case="case-sensitive">Index talk</namespace> |
|
38 |
+ <namespace key="106" case="case-sensitive">Rhymes</namespace> |
|
39 |
+ <namespace key="107" case="case-sensitive">Rhymes talk</namespace> |
|
40 |
+ <namespace key="108" case="case-sensitive">Transwiki</namespace> |
|
41 |
+ <namespace key="109" case="case-sensitive">Transwiki talk</namespace> |
|
42 |
+ <namespace key="110" case="case-sensitive">Wikisaurus</namespace> |
|
43 |
+ <namespace key="111" case="case-sensitive">Wikisaurus talk</namespace> |
|
44 |
+ <namespace key="114" case="case-sensitive">Citations</namespace> |
|
45 |
+ <namespace key="115" case="case-sensitive">Citations talk</namespace> |
|
46 |
+ <namespace key="116" case="case-sensitive">Sign gloss</namespace> |
|
47 |
+ <namespace key="117" case="case-sensitive">Sign gloss talk</namespace> |
|
48 |
+ <namespace key="118" case="case-sensitive">Reconstruction</namespace> |
|
49 |
+ <namespace key="119" case="case-sensitive">Reconstruction talk</namespace> |
|
50 |
+ <namespace key="828" case="case-sensitive">Module</namespace> |
|
51 |
+ <namespace key="829" case="case-sensitive">Module talk</namespace> |
|
52 |
+ <namespace key="2300" case="case-sensitive">Gadget</namespace> |
|
53 |
+ <namespace key="2301" case="case-sensitive">Gadget talk</namespace> |
|
54 |
+ <namespace key="2302" case="case-sensitive">Gadget definition</namespace> |
|
55 |
+ <namespace key="2303" case="case-sensitive">Gadget definition talk</namespace> |
|
56 |
+ <namespace key="2600" case="first-letter">Topic</namespace> |
|
57 |
+ </namespaces> |
|
58 |
+ </siteinfo> |
|
59 |
+ <page> |
|
60 |
+ <title>Reconstruction:Proto-Germanic/hungruz</title> |
|
61 |
+ <ns>118</ns> |
|
62 |
+ <id>3931063</id> |
|
63 |
+ <revision> |
|
64 |
+ <id>46910871</id> |
|
65 |
+ <parentid>42165964</parentid> |
|
66 |
+ <timestamp>2017-06-29T17:01:25Z</timestamp> |
|
67 |
+ <contributor> |
|
68 |
+ <username>Kennybot</username> |
|
69 |
+ <id>1464621</id> |
|
70 |
+ </contributor> |
|
71 |
+ <minor/> |
|
72 |
+ <comment>/* Declension */replaced: {{gem-decl-noun-u-mf|hungr}} → {{gem-decl-noun}} using [[Project:AWB|AWB]]</comment> |
|
73 |
+ <model>wikitext</model> |
|
74 |
+ <format>text/x-wiki</format> |
|
75 |
+ <text xml:space="preserve" bytes="1564">{{reconstructed}} |
|
76 |
+==Proto-Germanic== |
|
77 |
+ |
|
78 |
+===Alternative forms=== |
|
79 |
+* {{l|gem-pro|*hunhruz}} |
|
80 |
+ |
|
81 |
+===Etymology=== |
|
82 |
+From earlier {{m|gem-pro|*hunhruz}}, from {{etyl|ine-pro|gem-pro}} {{m|ine-pro|*kank-}}, {{m|ine-pro|*kenk-||to burn, dry, pain, desire, hunger, thirst}}. Cognate with {{cog|lt|kenkti||to damage, blight}}, {{cog|sa|काङ्क्षति||he wishes, desires}}. |
|
83 |
+ |
|
84 |
+===Noun=== |
|
85 |
+{{gem-noun|m}} |
|
86 |
+ |
|
87 |
+# [[hunger]] |
|
88 |
+ |
|
89 |
+====Declension==== |
|
90 |
+{{gem-decl-noun}} |
|
91 |
+ |
|
92 |
+====Derived terms==== |
|
93 |
+* {{l|gem-pro|*hungrijaną}} |
|
94 |
+* {{l|gem-pro|*hungrugaz}}/{{l|gem-pro|*hungragaz}} |
|
95 |
+ |
|
96 |
+====Descendants==== |
|
97 |
+* Old English: {{l|ang|hungor}} |
|
98 |
+** Middle English: {{l|enm|hunger}}, {{l|enm|honger}}, {{l|enm|hounger}} |
|
99 |
+*** Scots: {{l|sco|hounger}}, {{l|sco|hunger}} |
|
100 |
+*** English: {{l|en|hunger}} |
|
101 |
+* Old Frisian: {{l|ofs|hunger}} |
|
102 |
+** Saterland Frisian: ({{l|stq|Geehunger}}) |
|
103 |
+** West Frisian: {{l|fy|honger}}, {{l|fy|hûnger}} |
|
104 |
+* Old Saxon: {{l|osx|hungar}} |
|
105 |
+** Middle Low German: {{l|gml|hunger}} |
|
106 |
+*** Low German: {{l|nds-de|Hunger}} |
|
107 |
+* Old Dutch: {{l|odt|*hungar}}, {{l|odt|hunger}} |
|
108 |
+** Middle Dutch: {{l|dum|hunger}}, {{l|dum|honger}} |
|
109 |
+*** Dutch: {{l|nl|honger}} |
|
110 |
+* Old High German: {{l|goh|hungar}} |
|
111 |
+** Middle High German: {{l|gmh|hunger}} |
|
112 |
+*** German: {{l|de|Hunger}} |
|
113 |
+*** Luxembourgish: {{l|lb|Honger}} |
|
114 |
+*** Yiddish: {{l|yi|הונגער}} |
|
115 |
+* Old Norse: {{l|non|hungr}} |
|
116 |
+** Icelandic: {{l|is|hungur}} |
|
117 |
+** Norwegian: {{l|no|hunger}} |
|
118 |
+** Old Swedish: {{l|gmq-osw|hunger}} |
|
119 |
+*** Swedish: {{l|sv|hunger}} |
|
120 |
+** Danish: {{l|da|hunger}} |
|
121 |
+* Gothic: *hunhrus > {{l|got|𐌷𐌿𐌷𐍂𐌿𐍃|tr=hūhrus}}, *huggrus (in derivatives)</text> |
|
122 |
+ <sha1>67s06inul2cv24rnf7g5vsvzun07ati</sha1> |
|
123 |
+ </revision> |
|
124 |
+ </page> |
|
125 |
+</mediawiki> |
... | ... |
@@ -0,0 +1,51 @@ |
1 |
+__author__ = 'Ray' |
|
2 |
+ |
|
3 |
+import sys |
|
4 |
+import io |
|
5 |
+import re |
|
6 |
+import urllib.request |
|
7 |
+import urllib.parse |
|
8 |
+import xml.etree.ElementTree as ET |
|
9 |
+ |
|
10 |
+ |
|
11 |
# Heading that opens the Descendants section, at any heading depth.
_DESCENDANTS_HEADING = re.compile(r"=+\s*Descendants\s*=+")
# Any following line that starts a heading closes the section.
_NEXT_HEADING = re.compile(r"^==", re.MULTILINE)
# Body of a single-line {{...}} template ("." does not cross newlines).
_TEMPLATE_BODY = re.compile(r"\{\{(.*?)\}\}")


def parse_descendants(wikitext):
    """Extract the ``{{l|language|lemma}}`` links of a page's Descendants section.

    This is the pure, network-free part of :func:`get_descendants`.

    Args:
        wikitext: Raw wikitext of a Wiktionary page (may be ``None`` or empty).

    Returns:
        A list of ``{"language": ..., "lemma": ...}`` dicts in page order.
        Empty when there is no text or no Descendants heading.
    """
    if not wikitext:
        return []
    heading = _DESCENDANTS_HEADING.search(wikitext)
    if heading is None:
        return []

    # The section runs up to the next heading, or to the very end of the
    # page (inclusive, so a template that closes the page is not cut off).
    following = _NEXT_HEADING.search(wikitext, heading.end())
    section_end = following.start() if following else len(wikitext)
    section = wikitext[heading.end():section_end]

    descendant_words = []
    for template_body in _TEMPLATE_BODY.findall(section):
        fields = template_body.split("|")
        # Only link templates with both a language code and a lemma count;
        # anything shorter is malformed and skipped instead of crashing.
        if fields[0] == "l" and len(fields) >= 3:
            descendant_words.append({"language": fields[1], "lemma": fields[2]})
    return descendant_words


def get_descendants(title):
    """Fetch a page from en.wiktionary.org and parse its Descendants section.

    The page is requested through the MediaWiki API with ``export`` and
    ``exportnowrap`` set, which returns the raw wikitext wrapped in an XML
    export document. The wikitext is handed to :func:`parse_descendants`.

    Args:
        title: Full page title including any namespace prefix, e.g.
            ``"Reconstruction:Proto-Germanic/hungruz"``.

    Returns:
        A list of ``{"language": ..., "lemma": ...}`` dicts. Empty when the
        response contains no page text or the page has no Descendants section.

    Raises:
        urllib.error.URLError: if the HTTP request fails.
        xml.etree.ElementTree.ParseError: if the response is not well-formed XML.
    """
    request_parameters = {
        "action": "query",
        "format": "xml",
        "export": 1,
        "exportnowrap": 1,
        "titles": title,
        "redirects": 1
    }
    request_data = urllib.parse.urlencode(request_parameters).encode('utf8')
    wikipage_request = urllib.request.Request("https://en.wiktionary.org/w/api.php/", request_data)
    with urllib.request.urlopen(wikipage_request) as wikipage_response:
        root = ET.fromstring(wikipage_response.read())

    # Reuse whatever namespace the root element carries instead of pinning
    # one export schema version; the URI changes between MediaWiki releases.
    tag_prefix = root.tag[:root.tag.index("}") + 1] if root.tag.startswith("{") else ""
    text_element = root.find("./{0}page/{0}revision/{0}text".format(tag_prefix))
    if text_element is None:
        # Missing page or an API error document: nothing to parse.
        return []
    return parse_descendants(text_element.text)
|
46 |
+ |
|
47 |
+ |
|
48 |
def get_PG_decendants(word):
    """Return the descendants of a Proto-Germanic (gem-pro) reconstruction.

    Prepends the reconstruction namespace path to *word* and delegates the
    lookup to get_descendants().
    """
    return get_descendants("Reconstruction:Proto-Germanic/" + word)
... | ... |
@@ -0,0 +1,81 @@ |
1 |
+# encoding: utf-8 |
|
2 |
+ |
|
3 |
+import sys |
|
4 |
+from difflib import SequenceMatcher |
|
5 |
+import io |
|
6 |
+import re |
|
7 |
+import argparse |
|
8 |
+import urllib.request |
|
9 |
+import urllib.parse |
|
10 |
+import xml.etree.ElementTree as ET |
|
11 |
+from descendants import get_PG_decendants, get_descendants |
|
12 |
+ |
|
13 |
# Module-wide state: one runtime flag plus the built-in sample data set.
globalvars = {
    # Set by main() from the --escape option; read by myformat().
    "escape": False,
    "data": {
        # Sample lemmas; main() compares every pair that shares a "language".
        "words": [
            {"id": 0, "language": "gem", "lemma": "bītaną"},
            {"id": 1, "language": "gem", "lemma": "hrītaną"},
            {"id": 2, "language": "deu", "lemma": "beißen"},
            {"id": 3, "language": "deu", "lemma": "reißen"},
            {"id": 4, "language": "nld", "lemma": "bijten"},
            {"id": 5, "language": "nld", "lemma": "rijten"},
            {"id": 6, "language": "deu", "lemma": "scheißen"},
            {"id": 7, "language": "gem", "lemma": "skītaną"},
            {"id": 8, "language": "nld", "lemma": "schijten"},
        ],
        # Presumably ancestor -> descendant links keyed by the word "id"
        # values above; not read by any code in this file -- TODO confirm.
        "origins": [
            {"from": 0, "to": 2},
            {"from": 1, "to": 3},
            {"from": 0, "to": 4},
            {"from": 1, "to": 5},
            {"from": 7, "to": 6},
            {"from": 7, "to": 8},
        ]
    }
}
|
37 |
+ |
|
38 |
+ |
|
39 |
def myformat(string):
    """Return *string* ready for printing, escaped when escaping is enabled.

    Args:
        string: The text to format.

    Returns:
        *string* unchanged when ``globalvars["escape"]`` is falsy; otherwise
        the same text with non-ASCII and control characters written as
        backslash escapes (e.g. ``"ī"`` becomes ``"\\u012b"``). The result is
        always a ``str``.
    """
    if globalvars["escape"]:
        # str.encode() yields bytes, which "%s" would render as "b'...'".
        # The escaped form is pure ASCII, so decoding it back is lossless.
        return string.encode("unicode_escape").decode("ascii")
    return string
|
42 |
+ |
|
43 |
+ |
|
44 |
def compare(wordpair):
    """Print the character blocks shared by the lemmas of two words.

    *wordpair* holds two word dicts, each with a "lemma" entry. Every block
    reported by difflib.SequenceMatcher is printed; for blocks of non-zero
    size the matching slice of each lemma is printed as well.
    e.g. PG bītaną and hrītaną share "ītaną"
    """
    left_lemma = wordpair[0]["lemma"]
    right_lemma = wordpair[1]["lemma"]
    matcher = SequenceMatcher(None, left_lemma, right_lemma)
    print("\ncomparing '%s' and '%s' ..." % (myformat(left_lemma), myformat(right_lemma)))
    for block in matcher.get_matching_blocks():
        print("\tcurrent block: %s" % str(block))
        length = block[2]
        if length <= 0:
            # Zero-size blocks have no slice to show.
            continue
        for position, entry in enumerate(wordpair):
            lemma = entry["lemma"]
            # block[0] / block[1] are the start offsets in the first / second lemma.
            start = block[position]
            stop = start + length
            print("\t\t'%s'[%u:%u] = %s" % (myformat(lemma), start, stop, myformat(lemma[start:stop])))
|
60 |
+ |
|
61 |
+ |
|
62 |
def main(args):
    """Compare all same-language sample words, then print one descendant list.

    Args:
        args: Command-line arguments without the program name. The only
            option is ``--escape``, which makes myformat() escape lemmas
            before they are printed.

    Side effects:
        Stores the flag in ``globalvars["escape"]``, prints the comparison
        of every same-language word pair, and prints the result of
        get_PG_decendants("hrītaną") (imported from the descendants module).
    """
    # Local import keeps this block self-contained; the file header does
    # not import itertools.
    from itertools import combinations

    parser = argparse.ArgumentParser()
    # A plain boolean switch: True when given, False otherwise.
    parser.add_argument("--escape", dest="escape", action="store_true", help="enable escape")
    parameters = parser.parse_args(args)
    globalvars["escape"] = parameters.escape

    # combinations() yields each unordered pair once, in the same order as
    # a nested "for i / for j in range(i + 1, ...)" loop.
    for word1, word2 in combinations(globalvars["data"]["words"], 2):
        if word1["language"] == word2["language"]:
            compare((word1, word2))

    result = [
        {"language": word["language"], "lemma": myformat(word["lemma"])}
        for word in get_PG_decendants("hrītaną")
    ]
    print(str(result))
|
79 |
+ |
|
80 |
# Script entry point: hand main() the command-line arguments minus the
# program name. There is no __main__ guard, so this also runs on import.
main(sys.argv[1:])
|
81 |
+ |
|
0 | 82 |