#!/usr/bin/env python
#Copyright 2007 Sebastian Hagen
# This file is part of wordnet_tools.

# wordnet_tools is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2
# as published by the Free Software Foundation

# wordnet_tools is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with wordnet_tools; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

# This program requires python >= 2.4.

# This program converts wordnet index/data file pairs into dict index/data
# files usable by dictd.
# This is basically a reimplementation of the wnfilter program by Rik Faith,
# which unfortunately doesn't work correctly for wordnet files in the newer
# formats. This version of wordnet_structures should parse wordnet 2.1 files
# correctly, and create output very similar to what wnfilter would have
# written.

import datetime
from textwrap import TextWrapper

CAT_ADJECTIVE = 0
CAT_ADVERB = 1
CAT_NOUN = 2
CAT_VERB = 3

category_map = {
   'n': CAT_NOUN,
   'v': CAT_VERB,
   'a': CAT_ADJECTIVE,
   's': CAT_ADJECTIVE,
   'r': CAT_ADVERB
}


class WordIndex:
   def __init__(self, lemma, category, ptrs, synsets, tagsense_count):
      self.lemma = lemma
      self.category = category
      self.ptrs = ptrs
      self.synsets = synsets
      self.tagsense_count = tagsense_count

   @classmethod
   def build_from_line(cls, line_data, synset_map):
      line_split = line_data.split()
      lemma = line_split[0]
      category = category_map[line_split[1]]
      synset_count = int(line_split[2],10)
      ptr_count = int(line_split[3],10)
      # Pointer symbols start at field 4; field 3 is the pointer count itself.
      ptrs = [line_split[i] for i in range(4, 4 + ptr_count)]
      tagsense_count = int(line_split[5 + ptr_count],10)
      synsets = [synset_map[int(line_split[i],10)] for i in range(6 + ptr_count, 6 + ptr_count + synset_count)]
      return cls(lemma, category, ptrs, synsets, tagsense_count)

   @classmethod
   def build_from_file(cls, f, synset_map, rv_base=None):
      if (rv_base is None):
         rv = {}
      else:
         rv = rv_base

      for line in f:
         # Skip license header lines, which start with whitespace.
         if (line.startswith(' ')):
            continue
         wi = cls.build_from_line(line, synset_map)
         word = wi.lemma.lower()
         if not (word in rv):
            rv[word] = []
         rv[word].append(wi)
      return rv

   def __repr__(self):
      return '%s%s' % (self.__class__.__name__, (self.lemma, self.category, self.ptrs, self.synsets, self.tagsense_count))
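
# For reference, WordIndex.build_from_line() above assumes the index line
# layout documented in wndb(5WN):
#   lemma pos synset_cnt p_cnt [ptr_symbol...] sense_cnt tagsense_cnt
#   synset_offset [synset_offset...]
# A made-up example (not taken from any shipped index file): the line
#   'dog n 1 1 @ 1 1 02084071'
# parses to lemma 'dog', category CAT_NOUN, ptrs ['@'], tagsense_count 1,
# and the single synset stored under offset 2084071 in synset_map.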


class WordIndexDictFormatter(WordIndex):
   category_map_rev = {
      CAT_NOUN: 'n',
      CAT_VERB: 'v',
      CAT_ADJECTIVE: 'adj',
      CAT_ADVERB: 'adv'
   }
   linesep = '\n'
   LINE_WIDTH_MAX = 68
   prefix_fmtf_line_first = '%5s 1: '
   prefix_fmtn_line_first = '         '
   prefix_fmtf_line_nonfirst = '%5d: '
   prefix_fmtn_line_nonfirst = '       '

   def dict_str(self):
      tw = TextWrapper(width=self.LINE_WIDTH_MAX,
         initial_indent=(self.prefix_fmtf_line_first % self.category_map_rev[self.category]),
         subsequent_indent=self.prefix_fmtn_line_first)

      lines = (tw.wrap(self.synsets[0].dict_str()))
      i = 2
      for synset in self.synsets[1:]:
         tw = TextWrapper(width=self.LINE_WIDTH_MAX,
            initial_indent=(self.prefix_fmtf_line_nonfirst % i),
            subsequent_indent=self.prefix_fmtn_line_nonfirst)
         lines.extend(tw.wrap(synset.dict_str()))
         i += 1
      return self.linesep.join(lines)


class Synset:
   def __init__(self, offset, ss_type, words, ptrs, gloss, frames=()):
      self.offset = offset
      self.type = ss_type
      self.words = words
      self.ptrs = ptrs
      self.gloss = gloss
      self.frames = frames
      self.comments = []

   @classmethod
   def build_from_line(cls, line_data):
      line_split = line_data.split()
      synset_offset = int(line_split[0],10)
      ss_type = category_map[line_split[2]]
      # The word count field is hexadecimal.
      word_count = int(line_split[3],16)
      words = [line_split[i] for i in range(4, 4 + word_count*2,2)]
      ptr_count = int(line_split[4 + word_count*2],10)
      ptrs = [(line_split[i], line_split[i+1], line_split[i+2], line_split[i+3]) for i in range(5 + word_count*2,4 + word_count*2 + ptr_count*4,4)]

      tok = line_split[5 + word_count*2 + ptr_count*4]
      base = 6 + word_count*2 + ptr_count*4
      if (tok != '|'):
         frame_count = int(tok, 10)
         frames = [(int(line_split[i+1],10), int(line_split[i+2],16)) for i in range(base, base + frame_count*3, 3)]
         base += frame_count*3 + 1
      else:
         frames = []

      line_split2 = line_data.split(None, base)
      # With maxsplit == base, a gloss is present iff we get base+1 fields.
      if (len(line_split2) <= base):
         gloss = None
      else:
         gloss = line_split2[-1]

      return cls(synset_offset, ss_type, words, ptrs, gloss, frames)

   @classmethod
   def build_from_file(cls, f):
      rv = {}
      comments = []

      for line in f:
         if (line.startswith(' ')):
            line_s = line.lstrip().rstrip('\n')
            line_elements = line_s.split(None,1)
            try:
               int(line_elements[0])
            except ValueError:
               continue
            if (len(line_elements) == 1):
               line_elements.append('')
            comments.append(line_elements[1])
            continue
         synset = cls.build_from_line(line.rstrip())
         rv[synset.offset] = synset

      return (rv, comments)

   def dict_str(self):
      rv = self.gloss
      if (len(self.words) > 1):
         rv += ' [syn: %s]' % (', '.join([('{%s}' % word) for word in self.words]))
      return rv

   def __repr__(self):
      return '%s%s' % (self.__class__.__name__, (self.offset, self.type, self.words, self.ptrs, self.gloss, self.frames))
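
# For reference, Synset.build_from_line() above assumes the data line layout
# documented in wndb(5WN):
#   synset_offset lex_filenum ss_type w_cnt word lex_id [word lex_id...]
#   p_cnt [ptr...] [frames...] | gloss
# where w_cnt is hexadecimal, each ptr is the 4-tuple
# 'pointer_symbol synset_offset pos source/target', and frames (verbs only)
# take the form 'f_cnt + f_num w_num [+ f_num w_num...]' with w_num
# hexadecimal.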


class WordnetDict:
   db_info_fmt = '''This file was converted from the original database on:
   %(conversion_datetime)s

The original data is available from:
   %(wn_url)s

The original data was distributed with the notice shown below. No
additional restrictions are claimed. Please redistribute this changed
version under the same conditions and restrictions that apply to the
original version.\n\n
%(wn_license)s'''

   datetime_fmt = '%Y-%m-%dT%H:%M:%S'
   base64_map = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'

   def __init__(self, wn_url, desc_short, desc_long):
      self.word_data = {}
      self.wn_url = wn_url
      self.desc_short = desc_short
      self.desc_long = desc_long
      self.wn_license = None

   def wn_dict_add(self, file_index, file_data):
      file_data.seek(0)
      file_index.seek(0)
      (synsets, license_lines) = Synset.build_from_file(file_data)
      WordIndexDictFormatter.build_from_file(file_index, synsets, self.word_data)
      if (license_lines):
         self.wn_license = '\n'.join(license_lines) + '\n'

   @classmethod
   def base64_encode(cls, i):
      """Encode a non-negative integer into a dictd compatible base64 string"""
      if (i < 0):
         raise ValueError('Value %r for i is negative' % (i,))
      # Determine the number of base64 digits needed to represent i.
      r = 63
      e = 1
      while (r < i):
         e += 1
         r = 64**e - 1

      # Emit the digits, most significant first.
      rv = ''
      while (e > 0):
         e -= 1
         d = (i / 64**e)
         rv += cls.base64_map[d]
         i = i % (64**e)
      return rv
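
   # A few hand-checked reference values for base64_encode() ('A' encodes 0,
   # '/' encodes 63):
   #   base64_encode(0)    == 'A'
   #   base64_encode(63)   == '/'
   #   base64_encode(64)   == 'BA'
   #   base64_encode(4095) == '//'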

   @classmethod
   def dict_entry_write(cls, file_index, file_data, key, entry, linesep='\n'):
      """Write a single dict entry for <key> to index and data files"""
      entry_start = file_data.tell()
      file_data.write(entry)
      entry_len = len(entry)
      file_index.write('%s\t%s\t%s%s' % (key, cls.base64_encode(entry_start),
            cls.base64_encode(entry_len), linesep))

   def dict_generate(self, file_index, file_data):
      file_index.seek(0)
      file_data.seek(0)
      # The dictd file format is fairly iffy on the subject of special
      # headwords: either dictd is buggy, or the manpage doesn't tell the whole
      # story about the format.
      # The upshot is that order of these entries in the index *matters*.
      # Putting them at the beginning and in alphabetic order is afaict ok.
      # Some other orders completely and quietly break the ability to look
      # those headwords up.
      # -- problem encountered with 1.10.2, at 2007-08-05.
      file_data.write('\n')
      wn_url = self.wn_url
      conversion_datetime = datetime.datetime.now().strftime(self.datetime_fmt)
      wn_license = self.wn_license
      self.dict_entry_write(file_index, file_data, '00-database-info', '00-database-info\n%s\n' % (self.db_info_fmt % vars()))
      self.dict_entry_write(file_index, file_data, '00-database-long', '00-database-long\n%s\n' % self.desc_long)
      self.dict_entry_write(file_index, file_data, '00-database-short', '00-database-short\n%s\n' % self.desc_short)
      self.dict_entry_write(file_index, file_data, '00-database-url', '00-database-url\n%s\n' % self.wn_url)

      words = self.word_data.keys()
      words.sort()
      for word in words:
         for wi in self.word_data[word]:
            word_cs = word
            # Use case-sensitivity information of first entry of first synset
            # that matches this word case-insensitively
            for synset in wi.synsets:
               for ss_word in synset.words:
                  if (ss_word.lower() == word_cs.lower()):
                     word_cs = ss_word
                     break
               else:
                  continue
               break
            else:
               continue
            break

         outstr = ''
         for wi in self.word_data[word]:
            outstr += wi.dict_str() + '\n'

         outstr = '%s%s%s' % (word_cs, wi.linesep, outstr)
         self.dict_entry_write(file_index, file_data, word_cs, outstr, wi.linesep)

      file_index.truncate()
      file_data.truncate()


if (__name__ == '__main__'):
   import optparse
   op = optparse.OptionParser(usage='usage: %prog [options] (<wn_index_file> <wn_data_file>)+')
   op.add_option('-i', '--outindex', dest='oi', default='wn.index', help='filename of index file to write to')
   op.add_option('-d', '--outdata', dest='od', default='wn.dict', help='filename of data file to write to')
   op.add_option('--wn_url', dest='wn_url', default='ftp://ftp.cogsci.princeton.edu/pub/wordnet/2.0', help='URL for wordnet sources')
   op.add_option('--db_desc_short', dest='desc_short', default='     WordNet (r) 2.1 (2005)', help='short dict DB description')
   op.add_option('--db_desc_long', dest='desc_long', default='     WordNet (r): A Lexical Database for English from the\n     Cognitive Science Laboratory at Princeton University', help='long dict DB description')

   (options, args) = op.parse_args()

   wnd = WordnetDict(wn_url=options.wn_url, desc_short=options.desc_short, desc_long=options.desc_long)

   for i in range(0, len(args), 2):
      print 'Opening index file %r...' % args[i]
      file_index = file(args[i])
      print 'Opening data file %r...' % args[i+1]
      file_data = file(args[i+1])
      print 'Parsing index file and data file...'
      wnd.wn_dict_add(file_index, file_data)

   print 'All input files parsed. Writing output to index file %r and data file %r.' % (options.oi, options.od)

   wnd.dict_generate(file(options.oi, 'w'), file(options.od, 'w'))
   print 'All done.'
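
# Example invocation (the wordnet file paths below are illustrative; point
# them at wherever the index/data files actually live):
#   python wordnet_structures.py -i wn.index -d wn.dict \
#      /usr/share/wordnet/index.noun /usr/share/wordnet/data.noun \
#      /usr/share/wordnet/index.verb /usr/share/wordnet/data.verb
# Index/data files are consumed in pairs; each pair is merged into the same
# WordnetDict instance before dict_generate() writes the combined database.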