dictd-wiktionary: Let dictd serve an offline copy of wiktionary.

@vcunat: added -O to the python invocation, as processing the dump takes a long time

authored by Petr Rockai and committed by Vladimír Čunát b5451c73 c7fbe024

+812
+32
pkgs/servers/dict/dictd-wiktionary.nix
···
# DICT database generated from an English Wiktionary XML dump, for use
# with dictd.  The dump is converted by wiktionary2dict.py, which pipes
# its output through dictfmt (from the `dict` package).
{stdenv, fetchurl, python, dict, glibcLocales, writeScript}:

stdenv.mkDerivation rec {
  # Date stamp of the Wikimedia dump; also selects the download URL below.
  version = "20121021";
  name = "dict-db-wiktionary-${version}";
  data = fetchurl {
    url = "http://dumps.wikimedia.org/enwiktionary/${version}/enwiktionary-${version}-pages-articles.xml.bz2";
    sha256 = "1i4xwdpc2bx58495iy62iz0kn50c3qmnh4qribi82f2rd4qkfjd2";
  };

  convert = ./wiktionary2dict.py;
  buildInputs = [python dict glibcLocales];

  builder = writeScript "wiktionary-builder.sh" ''
    source $stdenv/setup

    # Plain mkdir -p instead of the stdenv-specific ensureDir helper.
    mkdir -p $out/share/dictd/
    cd $out/share/dictd

    # dictfmt is run with --locale en_US.UTF-8, so the locale archive
    # must be visible inside the build.
    export LOCALE_ARCHIVE=${glibcLocales}/lib/locale/locale-archive
    # -O: the conversion takes a long time; note that this also strips
    # the script's assert statements.
    python -O ${convert} ${data}
    dictzip wiktionary-en.dict
    echo en_US.UTF-8 > locale
  '';

  meta = {
    description = "DICT version of English Wiktionary";
    homepage = "http://en.wiktionary.org/";
    maintainers = [ stdenv.lib.maintainers.mornfall ];
    platforms = stdenv.lib.platforms.all;
  };
}
+778
pkgs/servers/dict/wiktionary2dict.py
···
··· 1 + # Adapted to produce DICT-compatible files by Petr Rockai in 2012 2 + # Based on code from wiktiondict by Greg Hewgill 3 + import re 4 + import sys 5 + import codecs 6 + import os 7 + import textwrap 8 + import time 9 + import xml.sax 10 + 11 + class Text: 12 + def __init__(self, s): 13 + self.s = s 14 + def process(self): 15 + return s 16 + 17 + class TemplateCall: 18 + def __init__(self): 19 + pass 20 + def process(self): 21 + pass 22 + 23 + class Template: 24 + def __init__(self): 25 + self.parts = [] 26 + def append(self, part): 27 + self.parts.append(part) 28 + def process(self): 29 + return ''.join(x.process() for x in self.parts) 30 + 31 + class Whitespace: 32 + def __init__(self, s): 33 + self.s = s 34 + 35 + class OpenDouble: pass 36 + class OpenTriple: pass 37 + class CloseDouble: pass 38 + class CloseTriple: pass 39 + 40 + class Equals: 41 + def __str__(self): 42 + return "=" 43 + 44 + class Delimiter: 45 + def __init__(self, c): 46 + self.c = c 47 + def __str__(self): 48 + return self.c 49 + 50 + def Tokenise(s): 51 + s = unicode(s) 52 + stack = [] 53 + last = 0 54 + i = 0 55 + while i < len(s): 56 + if s[i] == '{' and i+1 < len(s) and s[i+1] == '{': 57 + if i > last: 58 + yield s[last:i] 59 + if i+2 < len(s) and s[i+2] == '{': 60 + yield OpenTriple() 61 + stack.append(3) 62 + i += 3 63 + else: 64 + yield OpenDouble() 65 + stack.append(2) 66 + i += 2 67 + last = i 68 + elif s[i] == '}' and i+1 < len(s) and s[i+1] == '}': 69 + if i > last: 70 + yield s[last:i] 71 + if len(stack) == 0: 72 + yield "}}" 73 + i += 2 74 + elif stack[-1] == 2: 75 + yield CloseDouble() 76 + i += 2 77 + stack.pop() 78 + elif i+2 < len(s) and s[i+2] == '}': 79 + yield CloseTriple() 80 + i += 3 81 + stack.pop() 82 + else: 83 + raise SyntaxError() 84 + last = i 85 + elif s[i] == ':' or s[i] == '|': 86 + if i > last: 87 + yield s[last:i] 88 + yield Delimiter(s[i]) 89 + i += 1 90 + last = i 91 + elif s[i] == '=': 92 + if i > last: 93 + yield s[last:i] 94 + yield Equals() 95 + 
i += 1 96 + last = i 97 + #elif s[i] == ' ' or s[i] == '\t' or s[i] == '\n': 98 + # if i > last: 99 + # yield s[last:i] 100 + # last = i 101 + # m = re.match(r"\s+", s[i:]) 102 + # assert m 103 + # yield Whitespace(m.group(0)) 104 + # i += len(m.group(0)) 105 + # last = i 106 + else: 107 + i += 1 108 + if i > last: 109 + yield s[last:i] 110 + 111 + def processSub(templates, tokens, args): 112 + t = tokens.next() 113 + if not isinstance(t, unicode): 114 + raise SyntaxError 115 + name = t 116 + t = tokens.next() 117 + default = None 118 + if isinstance(t, Delimiter) and t.c == '|': 119 + default = "" 120 + while True: 121 + t = tokens.next() 122 + if isinstance(t, unicode): 123 + default += t 124 + elif isinstance(t, OpenDouble): 125 + default += processTemplateCall(templates, tokens, args) 126 + elif isinstance(t, OpenTriple): 127 + default += processSub(templates, tokens, args) 128 + elif isinstance(t, CloseTriple): 129 + break 130 + else: 131 + print "Unexpected:", t 132 + raise SyntaxError() 133 + if name in args: 134 + return args[name] 135 + if default is not None: 136 + return default 137 + if name == "lang": 138 + return "en" 139 + return "{{{%s}}}" % name 140 + 141 + def processTemplateCall(templates, tokens, args): 142 + template = tokens.next().strip().lower() 143 + args = {} 144 + a = 1 145 + t = tokens.next() 146 + while True: 147 + if isinstance(t, Delimiter): 148 + name = unicode(a) 149 + arg = "" 150 + while True: 151 + t = tokens.next() 152 + if isinstance(t, unicode): 153 + arg += t 154 + elif isinstance(t, OpenDouble): 155 + arg += processTemplateCall(templates, tokens, args) 156 + elif isinstance(t, OpenTriple): 157 + arg += processSub(templates, tokens, args) 158 + elif isinstance(t, Delimiter) and t.c != '|': 159 + arg += str(t) 160 + else: 161 + break 162 + if isinstance(t, Equals): 163 + name = arg.strip() 164 + arg = "" 165 + while True: 166 + t = tokens.next() 167 + if isinstance(t, (unicode, Equals)): 168 + arg += unicode(t) 169 + elif 
isinstance(t, OpenDouble): 170 + arg += processTemplateCall(templates, tokens, args) 171 + elif isinstance(t, OpenTriple): 172 + arg += processSub(templates, tokens, args) 173 + elif isinstance(t, Delimiter) and t.c != '|': 174 + arg += str(t) 175 + else: 176 + break 177 + arg = arg.strip() 178 + else: 179 + a += 1 180 + args[name] = arg 181 + elif isinstance(t, CloseDouble): 182 + break 183 + else: 184 + print "Unexpected:", t 185 + raise SyntaxError 186 + #print template, args 187 + if template[0] == '#': 188 + if template == "#if": 189 + if args['1'].strip(): 190 + return args['2'] 191 + elif '3' in args: 192 + return args['3'] 193 + else: 194 + return "" 195 + elif template == "#ifeq": 196 + if args['1'].strip() == args['2'].strip(): 197 + return args['3'] 198 + elif '4' in args: 199 + return args['4'] 200 + else: 201 + return "" 202 + elif template == "#ifexist": 203 + return "" 204 + elif template == "#switch": 205 + sw = args['1'].strip() 206 + if sw in args: 207 + return args[sw] 208 + else: 209 + return "" 210 + else: 211 + print "Unknown ParserFunction:", template 212 + sys.exit(1) 213 + if template not in templates: 214 + return "{{%s}}" % template 215 + return process(templates, templates[template], args) 216 + 217 + def process(templates, s, args = {}): 218 + s = re.compile(r"<!--.*?-->", re.DOTALL).sub("", s) 219 + s = re.compile(r"<noinclude>.*?</noinclude>", re.DOTALL).sub("", s) 220 + assert "<onlyinclude>" not in s 221 + #s = re.sub(r"(.*?)<onlyinclude>(.*?)</onlyinclude>(.*)", r"\1", s) 222 + s = re.compile(r"<includeonly>(.*?)</includeonly>", re.DOTALL).sub(r"\1", s) 223 + r = "" 224 + #print list(Tokenise(s)) 225 + tokens = Tokenise(s) 226 + try: 227 + while True: 228 + t = tokens.next() 229 + if isinstance(t, OpenDouble): 230 + r += processTemplateCall(templates, tokens, args) 231 + elif isinstance(t, OpenTriple): 232 + r += processSub(templates, tokens, args) 233 + else: 234 + r += unicode(t) 235 + except StopIteration: 236 + pass 237 + 
return r 238 + 239 + def test(): 240 + templates = { 241 + 'lb': "{{", 242 + 'name-example': "I am a template example, my first name is '''{{{firstName}}}''' and my last name is '''{{{lastName}}}'''. You can reference my page at [[{{{lastName}}}, {{{firstName}}}]].", 243 + 't': "start-{{{1|pqr}}}-end", 244 + 't0': "start-{{{1}}}-end", 245 + 't1': "start{{{1}}}end<noinclude>moo</noinclude>", 246 + 't2a1': "{{t2demo|a|{{{1}}}}}", 247 + 't2a2': "{{t2demo|a|2={{{1}}}}}", 248 + 't2demo': "start-{{{1}}}-middle-{{{2}}}-end", 249 + 't5': "{{t2demo|{{{a}}}=b}}", 250 + 't6': "t2demo|a", 251 + } 252 + def t(text, expected): 253 + print "text:", text 254 + s = process(templates, text) 255 + if s != expected: 256 + print "got:", s 257 + print "expected:", expected 258 + sys.exit(1) 259 + t("{{Name-example}}", "I am a template example, my first name is '''{{{firstName}}}''' and my last name is '''{{{lastName}}}'''. You can reference my page at [[{{{lastName}}}, {{{firstName}}}]].") 260 + t("{{Name-example | firstName=John | lastName=Smith }}", "I am a template example, my first name is '''John''' and my last name is '''Smith'''. 
You can reference my page at [[Smith, John]].") 261 + t("{{t0|a}}", "start-a-end") 262 + t("{{t0| }}", "start- -end") 263 + t("{{t0|}}", "start--end") 264 + t("{{t0}}", "start-{{{1}}}-end") 265 + t("{{t0| }}", "start- -end") 266 + t("{{t0|\n}}", "start-\n-end") 267 + t("{{t0|1= }}", "start--end") 268 + t("{{t0|1=\n}}", "start--end") 269 + t("{{T}}", "start-pqr-end") 270 + t("{{T|}}", "start--end") 271 + t("{{T|abc}}", "start-abc-end") 272 + t("{{T|abc|def}}", "start-abc-end") 273 + t("{{T|1=abc|1=def}}", "start-def-end") 274 + t("{{T|abc|1=def}}", "start-def-end") 275 + t("{{T|1=abc|def}}", "start-def-end") 276 + t("{{T|{{T}}}}", "start-start-pqr-end-end") 277 + t("{{T|{{T|{{T}}}}}}", "start-start-start-pqr-end-end-end") 278 + t("{{T|{{T|{{T|{{T}}}}}}}}", "start-start-start-start-pqr-end-end-end-end") 279 + t("{{T|a{{t|b}}}}", "start-astart-b-end-end") 280 + t("{{T|{{T|a=b}}}}", "start-start-pqr-end-end") 281 + t("{{T|a=b}}", "start-pqr-end") 282 + t("{{T|1=a=b}}", "start-a=b-end") 283 + #t("{{t1|{{lb}}tc}}}}", "start{{tcend}}") 284 + #t("{{t2a1|1=x=y}}", "start-a-middle-{{{2}}}-end") 285 + #t("{{t2a2|1=x=y}}", "start-a-middle-x=y-end") 286 + #t("{{t5|a=2=d}}", "start-{{{1}}}-middle-d=b-end") 287 + #t("{{ {{t6}} }}", "{{ t2demo|a }}") 288 + t("{{t|[[a|b]]}}", "start-b-end") 289 + t("{{t|[[a|b]] }}", "start-b -end") 290 + 291 + Parts = { 292 + # Standard POS headers 293 + 'noun': "n.", 294 + 'Noun': "n.", 295 + 'Noun 1': "n.", 296 + 'Noun 2': "n.", 297 + 'Verb': "v.", 298 + 'Adjective': "adj.", 299 + 'Adverb': "adv.", 300 + 'Pronoun': "pron.", 301 + 'Conjunction': "conj.", 302 + 'Interjection': "interj.", 303 + 'Preposition': "prep.", 304 + 'Proper noun': "n.p.", 305 + 'Proper Noun': "n.p.", 306 + 'Article': "art.", 307 + 308 + # Standard non-POS level 3 headers 309 + '{{acronym}}': "acr.", 310 + 'Acronym': "acr.", 311 + '{{abbreviation}}': "abbr.", 312 + '[[Abbreviation]]': "abbr.", 313 + 'Abbreviation': "abbr.", 314 + '[[initialism]]': "init.", 315 + 
'{{initialism}}': "init.", 316 + 'Initialism': "init.", 317 + 'Contraction': "cont.", 318 + 'Prefix': "prefix", 319 + 'Suffix': "suffix", 320 + 'Symbol': "sym.", 321 + 'Letter': "letter", 322 + 'Idiom': "idiom", 323 + 'Idioms': "idiom", 324 + 'Phrase': "phrase", 325 + 326 + # Debated POS level 3 headers 327 + 'Number': "num.", 328 + 'Numeral': "num.", 329 + 'Cardinal number': "num.", 330 + 'Ordinal number': "num.", 331 + 'Cardinal numeral': "num.", 332 + 'Ordinal numeral': "num.", 333 + 334 + # Other headers in use 335 + 'Personal pronoun': "pers.pron.", 336 + 'Adjective/Adverb': "adj./adv.", 337 + 'Proper adjective': "prop.adj.", 338 + 'Determiner': "det.", 339 + 'Demonstrative determiner': "dem.det.", 340 + 'Clitic': "clitic", 341 + 'Infix': "infix", 342 + 'Counter': "counter", 343 + 'Kanji': None, 344 + 'Kanji reading': None, 345 + 'Hiragana letter': None, 346 + 'Katakana letter': None, 347 + 'Pinyin': None, 348 + 'Han character': None, 349 + 'Hanzi': None, 350 + 'Hanja': None, 351 + 'Proverb': "prov.", 352 + 'Expression': None, 353 + 'Adjectival noun': None, 354 + 'Quasi-adjective': None, 355 + 'Particle': "part.", 356 + 'Infinitive particle': "part.", 357 + 'Possessive adjective': "poss.adj.", 358 + 'Verbal prefix': "v.p.", 359 + 'Postposition': "post.", 360 + 'Prepositional article': "prep.art.", 361 + 'Phrasal verb': "phr.v.", 362 + 'Participle': "participle", 363 + 'Interrogative auxiliary verb': "int.aux.v.", 364 + 'Pronominal adverb': "pron.adv.", 365 + 'Adnominal': "adn.", 366 + 'Abstract pronoun': "abs.pron.", 367 + 'Conjunction particle': None, 368 + 'Root': "root", 369 + 370 + # Non-standard, deprecated headers 371 + 'Noun form': "n.", 372 + 'Verb form': "v.", 373 + 'Adjective form': "adj.form.", 374 + 'Nominal phrase': "nom.phr.", 375 + 'Noun phrase': "n. phrase", 376 + 'Verb phrase': "v. 
phrase", 377 + 'Transitive verb': "v.t.", 378 + 'Intransitive verb': "v.i.", 379 + 'Reflexive verb': "v.r.", 380 + 'Cmavo': None, 381 + 'Romaji': "rom.", 382 + 'Hiragana': None, 383 + 'Furigana': None, 384 + 'Compounds': None, 385 + 386 + # Other headers seen 387 + 'Alternative forms': None, 388 + 'Alternative spellings': None, 389 + 'Anagrams': None, 390 + 'Antonym': None, 391 + 'Antonyms': None, 392 + 'Conjugation': None, 393 + 'Declension': None, 394 + 'Declension and pronunciations': None, 395 + 'Definite Article': "def.art.", 396 + 'Definite article': "def.art.", 397 + 'Demonstrative pronoun': "dem.pron.", 398 + 'Derivation': None, 399 + 'Derived expression': None, 400 + 'Derived expressions': None, 401 + 'Derived forms': None, 402 + 'Derived phrases': None, 403 + 'Derived terms': None, 404 + 'Derived, Related terms': None, 405 + 'Descendants': None, 406 + #'Etymology': None, 407 + #'Etymology 1': None, 408 + #'Etymology 2': None, 409 + #'Etymology 3': None, 410 + #'Etymology 4': None, 411 + #'Etymology 5': None, 412 + 'Examples': None, 413 + 'External links': None, 414 + '[[Gismu]]': None, 415 + 'Gismu': None, 416 + 'Homonyms': None, 417 + 'Homophones': None, 418 + 'Hyphenation': None, 419 + 'Indefinite article': "art.", 420 + 'Indefinite pronoun': "ind.pron.", 421 + 'Indefinite Pronoun': "ind.pron.", 422 + 'Indetermined pronoun': "ind.pron.", 423 + 'Interrogative conjunction': "int.conj.", 424 + 'Interrogative determiner': "int.det.", 425 + 'Interrogative particle': "int.part.", 426 + 'Interrogative pronoun': "int.pron.", 427 + 'Legal expression': "legal", 428 + 'Mass noun': "n.", 429 + 'Miscellaneous': None, 430 + 'Mutations': None, 431 + 'Noun and verb': "n/v.", 432 + 'Other language': None, 433 + 'Pinyin syllable': None, 434 + 'Possessive determiner': "poss.det.", 435 + 'Possessive pronoun': "poss.pron.", 436 + 'Prepositional phrase': "prep.phr.", 437 + 'Prepositional Pronoun': "prep.pron.", 438 + 'Pronunciation': None, 439 + 'Pronunciation 1': None, 440 
+ 'Pronunciation 2': None, 441 + 'Quotations': None, 442 + 'References': None, 443 + 'Reflexive pronoun': "refl.pron.", 444 + 'Related expressions': None, 445 + 'Related terms': None, 446 + 'Related words': None, 447 + 'Relative pronoun': "rel.pron.", 448 + 'Saying': "saying", 449 + 'See also': None, 450 + 'Shorthand': None, 451 + '[http://en.wikipedia.org/wiki/Shorthand Shorthand]': None, 452 + 'Sister projects': None, 453 + 'Spelling note': None, 454 + 'Synonyms': None, 455 + 'Translation': None, 456 + 'Translations': None, 457 + 'Translations to be checked': None, 458 + 'Transliteration': None, 459 + 'Trivia': None, 460 + 'Usage': None, 461 + 'Usage in English': None, 462 + 'Usage notes': None, 463 + 'Verbal noun': "v.n.", 464 + } 465 + PartsUsed = {} 466 + for p in Parts.keys(): 467 + PartsUsed[p] = 0 468 + 469 + def encode(s): 470 + r = e(s) 471 + assert r[1] == len(s) 472 + return r[0] 473 + 474 + def dowikilink(m): 475 + a = m.group(1).split("|") 476 + if len(a) > 1: 477 + link = a[1] 478 + else: 479 + link = a[0] 480 + if ':' in link: 481 + link = "" 482 + return link 483 + 484 + seentemplates = {} 485 + def dotemplate(m): 486 + aa = m.group(1).split("|") 487 + args = {} 488 + n = 0 489 + for a in aa: 490 + am = re.match(r"(.*?)(=(.*))?", a) 491 + if am: 492 + args[am.group(1)] = am.group(3) 493 + else: 494 + n += 1 495 + args[n] = am.group(1) 496 + 497 + #if aa[0] in seentemplates: 498 + # seentemplates[aa[0]] += 1 499 + #else: 500 + # seentemplates[aa[0]] = 1 501 + # print len(seentemplates), aa[0] 502 + #print aa[0] 503 + 504 + #if aa[0] not in Templates: 505 + # return "(unknown template %s)" % aa[0] 506 + #body = Templates[aa[0]] 507 + #body = re.sub(r"<noinclude>.*?</noinclude>", "", body) 508 + #assert "<onlyinclude>" not in body 509 + ##body = re.sub(r"(.*?)<onlyinclude>(.*?)</onlyinclude>(.*)", r"\1", body) 510 + #body = re.sub(r"<includeonly>(.*?)</includeonly>", r"\1", body) 511 + #def dotemplatearg(m): 512 + # ta = m.group(1).split("|") 513 + # 
if ta[0] in args: 514 + # return args[ta[0]] 515 + # elif len(ta) > 1: 516 + # return ta[1] 517 + # else: 518 + # return "{{{%s}}}" % ta[0] 519 + #body = re.sub(r"{{{(.*?)}}}", dotemplatearg, body) 520 + #return dewiki(body) 521 + 522 + def doparserfunction(m): 523 + a = m.group(2).split("|") 524 + if m.group(1) == "ifeq": 525 + if a[0] == a[1]: 526 + return a[2] 527 + elif len(a) >= 4: 528 + return a[3] 529 + return "" 530 + 531 + def dewiki(body, indent = 0): 532 + # process in this order: 533 + # {{{ }}} 534 + # <> <> 535 + # [[ ]] 536 + # {{ }} 537 + # ''' ''' 538 + # '' '' 539 + #body = wikimediatemplate.process(Templates, body) 540 + body = re.sub(r"\[\[(.*?)\]\]", dowikilink, body) 541 + #body = re.sub(r"{{(.*?)}}", dotemplate, body) 542 + #body = re.sub(r"{{#(.*?):(.*?)}}", doparserfunction, body) 543 + body = re.sub(r"'''(.*?)'''", r"\1", body) 544 + body = re.sub(r"''(.*?)''", r"\1", body) 545 + lines = body.split("\n") 546 + n = 0 547 + i = 0 548 + while i < len(lines): 549 + if len(lines[i]) > 0 and lines[i][0] == "#": 550 + if len(lines[i]) > 1 and lines[i][1] == '*': 551 + wlines = textwrap.wrap(lines[i][2:].strip(), 552 + initial_indent = " * ", 553 + subsequent_indent = " ") 554 + elif len(lines[i]) > 1 and lines[i][1] == ':': 555 + wlines = textwrap.wrap(lines[i][2:].strip(), 556 + initial_indent = " ", 557 + subsequent_indent = " ") 558 + else: 559 + n += 1 560 + wlines = textwrap.wrap(str(n) + ". 
" + lines[i][1:].strip(), 561 + subsequent_indent = " ") 562 + elif len(lines[i]) > 0 and lines[i][0] == "*": 563 + n = 0 564 + wlines = textwrap.wrap(lines[i][1:].strip(), 565 + initial_indent = "* ", 566 + subsequent_indent = " ") 567 + else: 568 + n = 0 569 + wlines = textwrap.wrap(lines[i].strip()) 570 + if len(wlines) == 0: 571 + wlines = [''] 572 + lines[i:i+1] = wlines 573 + i += len(wlines) 574 + return ''.join(" "*(indent-1)+x+"\n" for x in lines) 575 + 576 + class WikiSection: 577 + def __init__(self, heading, body): 578 + self.heading = heading 579 + self.body = body 580 + #self.lines = re.split("\n+", body.strip()) 581 + #if len(self.lines) == 1 and len(self.lines[0]) == 0: 582 + # self.lines = [] 583 + self.children = [] 584 + def __str__(self): 585 + return "<%s:%i:%s>" % (self.heading, len(self.body or ""), ','.join([str(x) for x in self.children])) 586 + def add(self, section): 587 + self.children.append(section) 588 + 589 + def parse(word, text): 590 + headings = list(re.finditer("^(=+)\s*(.*?)\s*=+\n", text, re.MULTILINE)) 591 + #print [x.group(1) for x in headings] 592 + doc = WikiSection(word, "") 593 + stack = [doc] 594 + for i, m in enumerate(headings): 595 + depth = len(m.group(1)) 596 + if depth < len(stack): 597 + stack = stack[:depth] 598 + else: 599 + while depth > len(stack): 600 + s = WikiSection(None, "") 601 + stack[-1].add(s) 602 + stack.append(s) 603 + if i+1 < len(headings): 604 + s = WikiSection(m.group(2), text[m.end(0):headings[i+1].start(0)].strip()) 605 + else: 606 + s = WikiSection(m.group(2), text[m.end(0):].strip()) 607 + assert len(stack) == depth 608 + stack[-1].add(s) 609 + stack.append(s) 610 + #while doc.heading is None and len(doc.lines) == 0 and len(doc.children) == 1: 611 + # doc = doc.children[0] 612 + return doc 613 + 614 + def formatFull(word, doc): 615 + def f(depth, section): 616 + if section.heading: 617 + r = " "*(depth-1) + section.heading + "\n\n" 618 + else: 619 + r = "" 620 + if section.body: 621 + r += 
dewiki(section.body, depth+1)+"\n" 622 + #r += "".join(" "*depth + x + "\n" for x in dewiki(section.body)) 623 + #if len(section.lines) > 0: 624 + # r += "\n" 625 + for c in section.children: 626 + r += f(depth+1, c) 627 + return r 628 + s = f(0, doc) 629 + s += "Ref: http://en.wiktionary.org/wiki/%s\n" % word 630 + return s 631 + 632 + def formatNormal(word, doc): 633 + def f(depth, posdepth, section): 634 + r = "" 635 + if depth == posdepth: 636 + if not section.heading or section.heading.startswith("Etymology"): 637 + posdepth += 1 638 + elif section.heading in Parts: 639 + #p = Parts[section.heading] 640 + #if p: 641 + # r += " "*(depth-1) + word + " (" + p + ")\n\n" 642 + r += " "*(depth-1) + section.heading + "\n\n" 643 + else: 644 + print >>errors, "Unknown part: (%s) %s" % (word, section.heading) 645 + return "" 646 + elif depth > posdepth: 647 + return "" 648 + elif section.heading: 649 + r += " "*(depth-1) + section.heading + "\n\n" 650 + if section.body: 651 + r += dewiki(section.body, depth+1)+"\n" 652 + #r += "".join(" "*depth + x + "\n" for x in dewiki(section.lines)) 653 + #if len(section.lines) > 0: 654 + # r += "\n" 655 + for c in section.children: 656 + r += f(depth+1, posdepth, c) 657 + return r 658 + s = f(0, 3, doc) 659 + s += "Ref: http://en.wiktionary.org/wiki/%s\n" % word 660 + return s 661 + 662 + def formatBrief(word, doc): 663 + def f(depth, posdepth, section): 664 + if depth == posdepth: 665 + h = section.heading 666 + if not section.heading or section.heading.startswith("Etymology"): 667 + posdepth += 1 668 + elif section.heading in Parts: 669 + #h = Parts[section.heading] 670 + #if h: 671 + # h = "%s (%s)" % (word, h) 672 + pass 673 + stack.append([h, False]) 674 + elif depth > 0: 675 + stack.append([section.heading, False]) 676 + else: 677 + stack.append(["%h " + section.heading, False]) 678 + r = "" 679 + #if section.heading: 680 + # r += " "*(depth-1) + section.heading + "\n" 681 + body = ''.join(x+"\n" for x in 
section.body.split("\n") if len(x) > 0 and x[0] == '#') 682 + if len(body) > 0: 683 + for i in range(len(stack)): 684 + if not stack[i][1]: 685 + if stack[i][0]: 686 + r += " "*(i-1) + stack[i][0] + "\n" 687 + stack[i][1] = True 688 + r += dewiki(body, depth+1) 689 + for c in section.children: 690 + r += f(depth+1, posdepth, c) 691 + stack.pop() 692 + return r 693 + stack = [] 694 + s = f(0, 3, doc) 695 + s += "Ref: http://en.wiktionary.org/wiki/%s\n" % word 696 + return s 697 + 698 + class WikiHandler(xml.sax.ContentHandler): 699 + def __init__(self): 700 + self.element = None 701 + self.page = None 702 + self.text = "" 703 + self.long = {} 704 + def startElement(self, name, attrs): 705 + #print "start", name, attrs 706 + self.element = name 707 + def endElement(self, name): 708 + #print "end", name 709 + if self.element == "text": 710 + if self.page: 711 + if self.page in self.long: 712 + print self.page, len(self.text) 713 + print 714 + self.doPage(self.page, self.text) 715 + self.page = None 716 + self.text = "" 717 + self.element = None 718 + def characters(self, content): 719 + #print "characters", content 720 + if self.element == "title": 721 + if self.checkPage(content): 722 + self.page = content 723 + elif self.element == "text": 724 + if self.page: 725 + self.text += content 726 + if len(self.text) > 100000 and self.page not in self.long: 727 + self.long[self.page] = 1 728 + def checkPage(self, page): 729 + return False 730 + def doPage(self, page, text): 731 + pass 732 + 733 + class TemplateHandler(WikiHandler): 734 + def checkPage(self, page): 735 + return page.startswith("Template:") 736 + def doPage(self, page, text): 737 + Templates[page[page.find(':')+1:].lower()] = text 738 + 739 + class WordHandler(WikiHandler): 740 + def checkPage(self, page): 741 + return ':' not in page 742 + def doPage(self, page, text): 743 + m = re.match(r"#redirect\s*\[\[(.*?)\]\]", text, re.IGNORECASE) 744 + if m: 745 + out.write(" See <%s>" % page) 746 + return 747 + doc 
= parse(page, text) 748 + out.write(formatBrief(page, doc)) 749 + #print formatBrief(page, doc) 750 + 751 + fn = sys.argv[1] 752 + info = """ This file was converted from the original database on: 753 + %s 754 + 755 + The original data is available from: 756 + http://en.wiktionary.org 757 + The version from which this file was generated was: 758 + %s 759 + 760 + Wiktionary is available under the GNU Free Documentation License. 761 + """ % (time.ctime(), os.path.basename(fn)) 762 + 763 + errors = codecs.open("mkdict.err", "w", "utf_8") 764 + e = codecs.getencoder("utf_8") 765 + 766 + Templates = {} 767 + f = os.popen("bunzip2 -c %s" % fn, "r") 768 + xml.sax.parse(f, TemplateHandler()) 769 + f.close() 770 + 771 + f = os.popen("bunzip2 -c %s" % fn, "r") 772 + out = codecs.getwriter("utf_8")( 773 + os.popen("dictfmt -p wiktionary-en --locale en_US.UTF-8 --columns 0 -u http://en.wiktionary.org", "w")) 774 + 775 + out.write(("%%h English Wiktionary\n%s" % info).encode('utf-8')) 776 + xml.sax.parse(f, WordHandler()) 777 + f.close() 778 + out.close()
+2
pkgs/top-level/all-packages.nix
··· 5512 inherit stdenv lib dict; 5513 }; 5514 5515 dictdWordnet = callPackage ../servers/dict/dictd-wordnet.nix {}; 5516 5517 dovecot = callPackage ../servers/mail/dovecot { };
··· 5512 inherit stdenv lib dict; 5513 }; 5514 5515 + dictdWiktionary = callPackage ../servers/dict/dictd-wiktionary.nix {}; 5516 + 5517 dictdWordnet = callPackage ../servers/dict/dictd-wordnet.nix {}; 5518 5519 dovecot = callPackage ../servers/mail/dovecot { };