dictd-wordnet: Make wordnet available via local dictd.

Authored by Petr Rockai, committed by Vladimír Čunát (commits c7fbe024, a7123fc2).

+357 lines added across 3 files.

pkgs/servers/dict/dictd-wordnet.nix (new file, +36)

{stdenv, fetchsvn, python, wordnet, writeScript}:

stdenv.mkDerivation rec {
  version = "542";
  name = "dict-db-wordnet-${version}";

  buildInputs = [python wordnet];
  convert = ./wordnet_structures.py;

  builder = writeScript "builder.sh" ''
    . ${stdenv}/setup
    ensureDir $out/share/dictd/
    cd $out/share/dictd

    for i in ${wordnet}/dict/data.*; do
      DATA="$DATA `echo $i | sed -e s,data,index,` $i";
    done

    python ${convert} $DATA
    echo en_US.UTF-8 > locale
  '';

  meta = {
    description = "dictd-compatible version of WordNet";

    longDescription =
      '' WordNet® is a large lexical database of English. This package makes
         the wordnet data available to dictd and by extension for lookup with
         the dict command. '';

    homepage = http://wordnet.princeton.edu/;

    maintainers = [ stdenv.lib.maintainers.mornfall ];
    platforms = stdenv.lib.platforms.all;
  };
}
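The builder's loop pairs each WordNet data file with its index file: `sed -e s,data,index,` rewrites a path such as .../dict/data.noun into .../dict/index.noun, so $DATA accumulates arguments in the `(<wn_index_file> <wn_data_file>)+` shape that the converter script below expects. A minimal sketch of that pairing in Python, using hypothetical paths in place of the real ${wordnet}/dict contents:

# A sketch of the index/data argument pairing the builder produces.
# The paths are hypothetical stand-ins for ${wordnet}/dict files.
data_files = ['/example/dict/data.noun', '/example/dict/data.verb']

args = []
for data in data_files:
    # Mirrors `sed -e s,data,index,`: rewrite the first "data" to "index".
    args += [data.replace('data', 'index', 1), data]

# The converter consumes these two at a time (see its __main__ block):
for i in range(0, len(args), 2):
    print('index: %s  data: %s' % (args[i], args[i + 1]))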
pkgs/servers/dict/wordnet_structures.py (new file, +319)

#!/usr/bin/env python
# Copyright 2007 Sebastian Hagen
# This file is part of wordnet_tools.

# wordnet_tools is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2
# as published by the Free Software Foundation

# wordnet_tools is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with wordnet_tools; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

# This program requires python >= 2.4.

# This program converts wordnet index/data file pairs into dict index/data
# files usable by dictd.
# This is basically a reimplementation of the wnfilter program by Rik Faith,
# which unfortunately doesn't work correctly for wordnet files in the newer
# formats. This version of wordnet_structures should parse wordnet 2.1 files
# correctly, and create output very similar to what wnfilter would have
# written.

import datetime
from textwrap import TextWrapper

CAT_ADJECTIVE = 0
CAT_ADVERB = 1
CAT_NOUN = 2
CAT_VERB = 3

category_map = {
    'n': CAT_NOUN,
    'v': CAT_VERB,
    'a': CAT_ADJECTIVE,
    's': CAT_ADJECTIVE,
    'r': CAT_ADVERB
}


class WordIndex:
    def __init__(self, lemma, category, ptrs, synsets, tagsense_count):
        self.lemma = lemma
        self.category = category
        self.ptrs = ptrs
        self.synsets = synsets
        self.tagsense_count = tagsense_count

    @classmethod
    def build_from_line(cls, line_data, synset_map):
        line_split = line_data.split()
        lemma = line_split[0]
        category = category_map[line_split[1]]
        synset_count = int(line_split[2], 10)
        ptr_count = int(line_split[3], 10)
        ptrs = [line_split[i] for i in range(3, 3 + ptr_count)]
        tagsense_count = int(line_split[5 + ptr_count], 10)
        synsets = [synset_map[int(line_split[i], 10)] for i in range(6 + ptr_count, 6 + ptr_count + synset_count)]
        return cls(lemma, category, ptrs, synsets, tagsense_count)

    @classmethod
    def build_from_file(cls, f, synset_map, rv_base=None):
        if (rv_base is None):
            rv = {}
        else:
            rv = rv_base

        for line in f:
            if (line.startswith('  ')):
                continue
            wi = cls.build_from_line(line, synset_map)
            word = wi.lemma.lower()
            if not (word in rv):
                rv[word] = []
            rv[word].append(wi)
        return rv

    def __repr__(self):
        return '%s%s' % (self.__class__.__name__, (self.lemma, self.category, self.ptrs, self.synsets, self.tagsense_count))


class WordIndexDictFormatter(WordIndex):
    category_map_rev = {
        CAT_NOUN: 'n',
        CAT_VERB: 'v',
        CAT_ADJECTIVE: 'adj',
        CAT_ADVERB: 'adv'
    }
    linesep = '\n'
    LINE_WIDTH_MAX = 68
    prefix_fmtf_line_first = '%5s 1: '
    prefix_fmtn_line_first = '         '
    prefix_fmtf_line_nonfirst = '%5d: '
    prefix_fmtn_line_nonfirst = '       '

    def dict_str(self):
        tw = TextWrapper(width=self.LINE_WIDTH_MAX,
            initial_indent=(self.prefix_fmtf_line_first % self.category_map_rev[self.category]),
            subsequent_indent=self.prefix_fmtn_line_first)

        lines = (tw.wrap(self.synsets[0].dict_str()))
        i = 2
        for synset in self.synsets[1:]:
            tw = TextWrapper(width=self.LINE_WIDTH_MAX,
                initial_indent=(self.prefix_fmtf_line_nonfirst % i),
                subsequent_indent=self.prefix_fmtn_line_nonfirst)
            lines.extend(tw.wrap(synset.dict_str()))
            i += 1
        return self.linesep.join(lines)


class Synset:
    def __init__(self, offset, ss_type, words, ptrs, gloss, frames=()):
        self.offset = offset
        self.type = ss_type
        self.words = words
        self.ptrs = ptrs
        self.gloss = gloss
        self.frames = frames
        self.comments = []

    @classmethod
    def build_from_line(cls, line_data):
        line_split = line_data.split()
        synset_offset = int(line_split[0], 10)
        ss_type = category_map[line_split[2]]
        word_count = int(line_split[3], 16)
        words = [line_split[i] for i in range(4, 4 + word_count*2, 2)]
        ptr_count = int(line_split[4 + word_count*2], 10)
        ptrs = [(line_split[i], line_split[i+1], line_split[i+2], line_split[i+3]) for i in range(5 + word_count*2, 4 + word_count*2 + ptr_count*4, 4)]

        tok = line_split[5 + word_count*2 + ptr_count*4]
        base = 6 + word_count*2 + ptr_count*4
        if (tok != '|'):
            frame_count = int(tok, 10)
            frames = [(int(line_split[i+1], 10), int(line_split[i+2], 16)) for i in range(base, base + frame_count*3, 3)]
            base += frame_count*3 + 1
        else:
            frames = []

        line_split2 = line_data.split(None, base)
        if (len(line_split2) < base):
            gloss = None
        else:
            gloss = line_split2[-1]

        return cls(synset_offset, ss_type, words, ptrs, gloss, frames)

    @classmethod
    def build_from_file(cls, f):
        rv = {}
        comments = []

        for line in f:
            if (line.startswith('  ')):
                line_s = line.lstrip().rstrip('\n')
                line_elements = line_s.split(None, 1)
                try:
                    int(line_elements[0])
                except ValueError:
                    continue
                if (len(line_elements) == 1):
                    line_elements.append('')
                comments.append(line_elements[1])
                continue
            synset = cls.build_from_line(line.rstrip())
            rv[synset.offset] = synset

        return (rv, comments)

    def dict_str(self):
        rv = self.gloss
        if (len(self.words) > 1):
            rv += ' [syn: %s]' % (', '.join([('{%s}' % word) for word in self.words]))
        return rv

    def __repr__(self):
        return '%s%s' % (self.__class__.__name__, (self.offset, self.type, self.words, self.ptrs, self.gloss, self.frames))


class WordnetDict:
    db_info_fmt = '''This file was converted from the original database on:
          %(conversion_datetime)s

The original data is available from:
          %(wn_url)s

The original data was distributed with the notice shown below. No
additional restrictions are claimed.  Please redistribute this changed
version under the same conditions and restriction that apply to the
original version.\n\n
%(wn_license)s'''

    datetime_fmt = '%Y-%m-%dT%H:%M:%S'
    base64_map = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'

    def __init__(self, wn_url, desc_short, desc_long):
        self.word_data = {}
        self.wn_url = wn_url
        self.desc_short = desc_short
        self.desc_long = desc_long
        self.wn_license = None

    def wn_dict_add(self, file_index, file_data):
        file_data.seek(0)
        file_index.seek(0)
        (synsets, license_lines) = Synset.build_from_file(file_data)
        WordIndexDictFormatter.build_from_file(file_index, synsets, self.word_data)
        if (license_lines):
            self.wn_license = '\n'.join(license_lines) + '\n'

    @classmethod
    def base64_encode(cls, i):
        """Encode a non-negative integer into a dictd compatible base64 string"""
        if (i < 0):
            raise ValueError('Value %r for i is negative' % (i,))
        r = 63
        e = 1
        while (r < i):
            e += 1
            r = 64**e - 1

        rv = ''
        while (e > 0):
            e -= 1
            d = (i / 64**e)
            rv += cls.base64_map[d]
            i = i % (64**e)
        return rv

    @classmethod
    def dict_entry_write(cls, file_index, file_data, key, entry, linesep='\n'):
        """Write a single dict entry for <key> to index and data files"""
        entry_start = file_data.tell()
        file_data.write(entry)
        entry_len = len(entry)
        file_index.write('%s\t%s\t%s%s' % (key, cls.base64_encode(entry_start),
            cls.base64_encode(entry_len), linesep))

    def dict_generate(self, file_index, file_data):
        file_index.seek(0)
        file_data.seek(0)
        # The dictd file format is fairly iffy on the subject of special
        # headwords: either dictd is buggy, or the manpage doesn't tell the whole
        # story about the format.
        # The upshot is that order of these entries in the index *matters*.
        # Putting them at the beginning and in alphabetic order is afaict ok.
        # Some other orders completely and quietly break the ability to look
        # those headwords up.
        # -- problem encountered with 1.10.2, at 2007-08-05.
        file_data.write('\n')
        wn_url = self.wn_url
        conversion_datetime = datetime.datetime.now().strftime(self.datetime_fmt)
        wn_license = self.wn_license
        self.dict_entry_write(file_index, file_data, '00-database-info', '00-database-info\n%s\n' % (self.db_info_fmt % vars()))
        self.dict_entry_write(file_index, file_data, '00-database-long', '00-database-long\n%s\n' % self.desc_long)
        self.dict_entry_write(file_index, file_data, '00-database-short', '00-database-short\n%s\n' % self.desc_short)
        self.dict_entry_write(file_index, file_data, '00-database-url', '00-database-url\n%s\n' % self.wn_url)

        words = self.word_data.keys()
        words.sort()
        for word in words:
            for wi in self.word_data[word]:
                word_cs = word
                # Use case-sensitivity information of first entry of first synset
                # that matches this word case-insensitively
                for synset in wi.synsets:
                    for ss_word in synset.words:
                        if (ss_word.lower() == word_cs.lower()):
                            word_cs = ss_word
                            break
                    else:
                        continue
                    break
                else:
                    continue
                break

            outstr = ''
            for wi in self.word_data[word]:
                outstr += wi.dict_str() + '\n'

            outstr = '%s%s%s' % (word_cs, wi.linesep, outstr)
            self.dict_entry_write(file_index, file_data, word_cs, outstr, wi.linesep)

        file_index.truncate()
        file_data.truncate()


if (__name__ == '__main__'):
    import optparse
    op = optparse.OptionParser(usage='usage: %prog [options] (<wn_index_file> <wn_data_file>)+')
    op.add_option('-i', '--outindex', dest='oi', default='wn.index', help='filename of index file to write to')
    op.add_option('-d', '--outdata', dest='od', default='wn.dict', help='filename of data file to write to')
    op.add_option('--wn_url', dest='wn_url', default='ftp://ftp.cogsci.princeton.edu/pub/wordnet/2.0', help='URL for wordnet sources')
    op.add_option('--db_desc_short', dest='desc_short', default='     WordNet (r) 2.1 (2005)', help='short dict DB description')
    op.add_option('--db_desc_long', dest='desc_long', default='    WordNet (r): A Lexical Database for English from the\n     Cognitive Science Laboratory at Princeton University', help='long dict DB description')

    (options, args) = op.parse_args()

    wnd = WordnetDict(wn_url=options.wn_url, desc_short=options.desc_short, desc_long=options.desc_long)

    for i in range(0, len(args), 2):
        print 'Opening index file %r...' % args[i]
        file_index = file(args[i])
        print 'Opening data file %r...' % args[i+1]
        file_data = file(args[i+1])
        print 'Parsing index file and data file...'
        wnd.wn_dict_add(file_index, file_data)

    print 'All input files parsed. Writing output to index file %r and data file %r.' % (options.oi, options.od)

    wnd.dict_generate(file(options.oi, 'w'), file(options.od, 'w'))
    print 'All done.'
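For reference, the dict index format that dict_entry_write emits is one tab-separated line per headword: the headword, then the entry's byte offset into the data file, then its byte length, both rendered in dictd's base64 alphabet (the base64_map string above). A small self-contained sketch of that encoding; the headword and numbers are made up for illustration:

# dictd's base64 alphabet, the same string as base64_map above.
B64 = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'

def b64_encode(n):
    # Encode a non-negative integer, most significant digit first;
    # 0 encodes as 'A', matching base64_encode above.
    if n == 0:
        return 'A'
    digits = ''
    while n > 0:
        digits = B64[n % 64] + digits
        n //= 64
    return digits

# An index line maps a headword to (offset, length) in the data file:
offset, length = 1000, 250  # made-up values
print('%s\t%s\t%s' % ('example', b64_encode(offset), b64_encode(length)))
# -> example	Po	D6   (1000 = 15*64 + 40 -> 'Po'; 250 = 3*64 + 58 -> 'D6')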
pkgs/top-level/all-packages.nix (+2)

@@ -5512,6 +5512,8 @@
     inherit stdenv lib dict;
   };
 
+  dictdWordnet = callPackage ../servers/dict/dictd-wordnet.nix {};
+
   dovecot = callPackage ../servers/mail/dovecot { };
 
   ejabberd = callPackage ../servers/xmpp/ejabberd { };