Clone of https://github.com/NixOS/nixpkgs.git (to stress-test knotserver)
#!/usr/bin/env python3
# Copyright 2007 Sebastian Hagen
# This file is part of wordnet_tools.

# wordnet_tools is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License version 2
# as published by the Free Software Foundation.

# wordnet_tools is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU General Public License for more details.

# You should have received a copy of the GNU General Public License
# along with wordnet_tools; if not, write to the Free Software
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA

# This program requires Python >= 2.4; it also runs unmodified under Python 3.

# This program converts wordnet index/data file pairs into dict index/data
# files usable by dictd.
# This is basically a reimplementation of the wnfilter program by Rik Faith,
# which unfortunately doesn't work correctly for wordnet files in the newer
# formats. This version of wordnet_structures should parse wordnet 2.1 files
# correctly, and create output very similar to what wnfilter would have
# written.

import datetime
from textwrap import TextWrapper

CAT_ADJECTIVE = 0
CAT_ADVERB = 1
CAT_NOUN = 2
CAT_VERB = 3

category_map = {
    'n': CAT_NOUN,
    'v': CAT_VERB,
    'a': CAT_ADJECTIVE,
    's': CAT_ADJECTIVE,
    'r': CAT_ADVERB
}


class WordIndex:
    def __init__(self, lemma, category, ptrs, synsets, tagsense_count):
        self.lemma = lemma
        self.category = category
        self.ptrs = ptrs
        self.synsets = synsets
        self.tagsense_count = tagsense_count

    @classmethod
    def build_from_line(cls, line_data, synset_map):
        # Index line layout (see wndb(5WN)):
        #   lemma pos synset_cnt p_cnt [ptr_symbol...] sense_cnt tagsense_cnt
        #   synset_offset [synset_offset...]
        line_split = line_data.split()
        lemma = line_split[0]
        category = category_map[line_split[1]]
        synset_count = int(line_split[2], 10)
        ptr_count = int(line_split[3], 10)
        # The pointer symbols start directly after the p_cnt field.
        ptrs = [line_split[i] for i in range(4, 4 + ptr_count)]
        tagsense_count = int(line_split[5 + ptr_count], 10)
        synsets = [synset_map[int(line_split[i], 10)]
                   for i in range(6 + ptr_count, 6 + ptr_count + synset_count)]
        return cls(lemma, category, ptrs, synsets, tagsense_count)

    @classmethod
    def build_from_file(cls, f, synset_map, rv_base=None):
        if (rv_base is None):
            rv = {}
        else:
            rv = rv_base

        for line in f:
            # License-header lines in wordnet files are indented.
            if (line.startswith('  ')):
                continue
            wi = cls.build_from_line(line, synset_map)
            word = wi.lemma.lower()
            if not (word in rv):
                rv[word] = []
            rv[word].append(wi)
        return rv

    def __repr__(self):
        return '%s%s' % (self.__class__.__name__, (self.lemma, self.category,
            self.ptrs, self.synsets, self.tagsense_count))
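
# A sketch of the index line layout this parser assumes (per wndb(5WN)); the
# sample line below is illustrative, not taken from a real database:
#
#     lemma pos synset_cnt p_cnt [ptr_symbol...] sense_cnt tagsense_cnt synset_offset...
#     frob v 2 1 @ 2 1 00001740 00002098
#
# WordIndex.build_from_line would read this as lemma='frob', category=CAT_VERB,
# ptrs=['@'] and tagsense_count=1, and would resolve the two trailing offsets
# through synset_map.
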

class WordIndexDictFormatter(WordIndex):
    category_map_rev = {
        CAT_NOUN: 'n',
        CAT_VERB: 'v',
        CAT_ADJECTIVE: 'adj',
        CAT_ADVERB: 'adv'
    }
    linesep = '\n'
    LINE_WIDTH_MAX = 68
    # The continuation prefixes are all-space strings as wide as the rendered
    # first-line prefixes ('%5s 1: ' and '%5d: ').
    prefix_fmtf_line_first = '%5s 1: '
    prefix_fmtn_line_first = ' ' * 9
    prefix_fmtf_line_nonfirst = '%5d: '
    prefix_fmtn_line_nonfirst = ' ' * 7

    def dict_str(self):
        tw = TextWrapper(width=self.LINE_WIDTH_MAX,
            initial_indent=(self.prefix_fmtf_line_first % self.category_map_rev[self.category]),
            subsequent_indent=self.prefix_fmtn_line_first)

        lines = tw.wrap(self.synsets[0].dict_str())
        i = 2
        for synset in self.synsets[1:]:
            tw = TextWrapper(width=self.LINE_WIDTH_MAX,
                initial_indent=(self.prefix_fmtf_line_nonfirst % i),
                subsequent_indent=self.prefix_fmtn_line_nonfirst)
            lines.extend(tw.wrap(synset.dict_str()))
            i += 1
        return self.linesep.join(lines)


class Synset:
    def __init__(self, offset, ss_type, words, ptrs, gloss, frames=()):
        self.offset = offset
        self.type = ss_type
        self.words = words
        self.ptrs = ptrs
        self.gloss = gloss
        self.frames = frames
        self.comments = []

    @classmethod
    def build_from_line(cls, line_data):
        # Data line layout (see wndb(5WN)):
        #   synset_offset lex_filenum ss_type w_cnt word lex_id [word lex_id...]
        #   p_cnt [ptr...] [frames...] | gloss
        line_split = line_data.split()
        synset_offset = int(line_split[0], 10)
        ss_type = category_map[line_split[2]]
        word_count = int(line_split[3], 16)
        words = [line_split[i] for i in range(4, 4 + word_count*2, 2)]
        ptr_count = int(line_split[4 + word_count*2], 10)
        ptrs = [(line_split[i], line_split[i+1], line_split[i+2], line_split[i+3])
                for i in range(5 + word_count*2, 4 + word_count*2 + ptr_count*4, 4)]

        tok = line_split[5 + word_count*2 + ptr_count*4]
        base = 6 + word_count*2 + ptr_count*4
        if (tok != '|'):
            # Verb synsets carry frame data before the gloss separator.
            frame_count = int(tok, 10)
            frames = [(int(line_split[i+1], 10), int(line_split[i+2], 16))
                      for i in range(base, base + frame_count*3, 3)]
            base += frame_count*3 + 1
        else:
            frames = []

        line_split2 = line_data.split(None, base)
        # With maxsplit=base, the gloss (if any) ends up whole in the last
        # element; fewer than base+1 elements means there is no gloss.
        if (len(line_split2) <= base):
            gloss = None
        else:
            gloss = line_split2[-1]

        return cls(synset_offset, ss_type, words, ptrs, gloss, frames)

    @classmethod
    def build_from_file(cls, f):
        rv = {}
        comments = []

        for line in f:
            if (line.startswith('  ')):
                # Numbered license-header line; keep its text for later output.
                line_s = line.lstrip().rstrip('\n')
                line_elements = line_s.split(None, 1)
                try:
                    int(line_elements[0])
                except ValueError:
                    continue
                if (len(line_elements) == 1):
                    line_elements.append('')
                comments.append(line_elements[1])
                continue
            synset = cls.build_from_line(line.rstrip())
            rv[synset.offset] = synset

        return (rv, comments)

    def dict_str(self):
        rv = self.gloss
        if (len(self.words) > 1):
            rv += ' [syn: %s]' % (', '.join([('{%s}' % word) for word in self.words]))
        return rv

    def __repr__(self):
        return '%s%s' % (self.__class__.__name__, (self.offset, self.type,
            self.words, self.ptrs, self.gloss, self.frames))
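
# Data-file lines follow the wndb(5WN) layout; an illustrative (simplified)
# noun line:
#
#     00001740 03 n 01 entity 0 001 ~ 00001930 n 0000 | that which is perceived
#
# Note the mixed bases Synset.build_from_line has to deal with: w_cnt ('01')
# is hexadecimal, p_cnt ('001') is decimal, and everything after the '|'
# separator is the gloss text.
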

class WordnetDict:
    db_info_fmt = '''This file was converted from the original database on:
    %(conversion_datetime)s

The original data is available from:
    %(wn_url)s

The original data was distributed with the notice shown below. No
additional restrictions are claimed.  Please redistribute this changed
version under the same conditions and restrictions that apply to the
original version.\n\n
%(wn_license)s'''

    datetime_fmt = '%Y-%m-%dT%H:%M:%S'
    base64_map = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/'

    def __init__(self, wn_url, desc_short, desc_long):
        self.word_data = {}
        self.wn_url = wn_url
        self.desc_short = desc_short
        self.desc_long = desc_long
        self.wn_license = None

    def wn_dict_add(self, file_index, file_data):
        file_data.seek(0)
        file_index.seek(0)
        (synsets, license_lines) = Synset.build_from_file(file_data)
        WordIndexDictFormatter.build_from_file(file_index, synsets, self.word_data)
        if (license_lines):
            self.wn_license = '\n'.join(license_lines) + '\n'

    @classmethod
    def base64_encode(cls, i):
        """Encode a non-negative integer into a dictd compatible base64 string."""
        if (i < 0):
            raise ValueError('Value %r for i is negative' % (i,))
        # Find the number of base64 digits needed to represent i.
        r = 63
        e = 1
        while (r < i):
            e += 1
            r = 64**e - 1

        # Emit digits from the most significant down; integer division avoids
        # float precision loss for very large offsets.
        rv = ''
        while (e > 0):
            e -= 1
            d = i // 64**e
            rv += cls.base64_map[d]
            i = i % (64**e)
        return rv

    @classmethod
    def dict_entry_write(cls, file_index, file_data, key, entry, linesep='\n'):
        """Write a single dict entry for <key> to index and data files."""
        entry_start = file_data.tell()
        file_data.write(entry)
        entry_len = len(entry)
        file_index.write('%s\t%s\t%s%s' % (key, cls.base64_encode(entry_start),
            cls.base64_encode(entry_len), linesep))
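
    # A worked sketch of what dict_entry_write emits (values illustrative):
    # with the base64_map above, base64_encode(0) == 'A', base64_encode(63)
    # == '/', and base64_encode(64) == 'BA'.  An entry of 70 bytes starting
    # at data-file offset 4000 would therefore produce the index line
    # 'lemma\t+g\tBG\n'.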

    def dict_generate(self, file_index, file_data):
        file_index.seek(0)
        file_data.seek(0)
        # The dictd file format is fairly iffy on the subject of special
        # headwords: either dictd is buggy, or the manpage doesn't tell the
        # whole story about the format.
        # The upshot is that order of these entries in the index *matters*.
        # Putting them at the beginning and in alphabetic order is afaict ok.
        # Some other orders completely and quietly break the ability to look
        # those headwords up.
        # -- problem encountered with 1.10.2, at 2007-08-05.
        file_data.write('\n')
        wn_url = self.wn_url
        conversion_datetime = datetime.datetime.now().strftime(self.datetime_fmt)
        wn_license = self.wn_license
        self.dict_entry_write(file_index, file_data, '00-database-info', '00-database-info\n%s\n' % (self.db_info_fmt % vars()))
        self.dict_entry_write(file_index, file_data, '00-database-long', '00-database-long\n%s\n' % self.desc_long)
        self.dict_entry_write(file_index, file_data, '00-database-short', '00-database-short\n%s\n' % self.desc_short)
        self.dict_entry_write(file_index, file_data, '00-database-url', '00-database-url\n%s\n' % self.wn_url)

        words = list(self.word_data.keys())
        words.sort()
        for word in words:
            for wi in self.word_data[word]:
                word_cs = word
                # Use case-sensitivity information of first entry of first
                # synset that matches this word case-insensitively
                for synset in wi.synsets:
                    for ss_word in synset.words:
                        if (ss_word.lower() == word_cs.lower()):
                            word_cs = ss_word
                            break
                    else:
                        continue
                    break
                else:
                    continue
                break

            outstr = ''
            for wi in self.word_data[word]:
                outstr += wi.dict_str() + '\n'

            outstr = '%s%s%s' % (word_cs, wi.linesep, outstr)
            self.dict_entry_write(file_index, file_data, word_cs, outstr, wi.linesep)

        file_index.truncate()
        file_data.truncate()


if (__name__ == '__main__'):
    import optparse
    op = optparse.OptionParser(usage='usage: %prog [options] (<wn_index_file> <wn_data_file>)+')
    op.add_option('-i', '--outindex', dest='oi', default='wn.index', help='filename of index file to write to')
    op.add_option('-d', '--outdata', dest='od', default='wn.dict', help='filename of data file to write to')
    op.add_option('--wn_url', dest='wn_url', default='ftp://ftp.cogsci.princeton.edu/pub/wordnet/2.0', help='URL for wordnet sources')
    op.add_option('--db_desc_short', dest='desc_short', default=' WordNet (r) 2.1 (2005)', help='short dict DB description')
    op.add_option('--db_desc_long', dest='desc_long', default=' WordNet (r): A Lexical Database for English from the\n Cognitive Science Laboratory at Princeton University', help='long dict DB description')

    (options, args) = op.parse_args()

    wnd = WordnetDict(wn_url=options.wn_url, desc_short=options.desc_short, desc_long=options.desc_long)

    for i in range(0, len(args), 2):
        print('Opening index file %r...' % args[i])
        file_index = open(args[i])
        print('Opening data file %r...' % args[i+1])
        file_data = open(args[i+1])
        print('Parsing index file and data file...')
        wnd.wn_dict_add(file_index, file_data)

    print('All input files parsed. Writing output to index file %r and data file %r.' % (options.oi, options.od))

    wnd.dict_generate(open(options.oi, 'w'), open(options.od, 'w'))
    print('All done.')
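
# Example invocation (file names are illustrative; any number of index/data
# pairs may be given, per the usage string above):
#
#     python3 wordnet_structures.py -i wn.index -d wn.dict \
#         index.noun data.noun index.verb data.verb
#
# The resulting wn.index / wn.dict pair can then be served by dictd.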