~ubuntu-branches/ubuntu/quantal/kyotocabinet/quantal

« back to all changes in this revision

Viewing changes to lab/kcdict/kcdictwntotsv

  • Committer: Package Import Robot
  • Author(s): Shawn Landden
  • Date: 2012-06-07 16:12:07 UTC
  • Revision ID: package-import@ubuntu.com-20120607161207-prbj5blqgzzfl8of
Tags: upstream-1.2.76
Import upstream version 1.2.76

Show diffs side-by-side

added added

removed removed

Lines of Context:
 
1
#! /usr/bin/ruby
 
2
# -*- coding: utf-8 -*-
 
3
 
 
4
# Converts WordNet-style data files (data.noun, data.verb, data.adj, data.adv)
# found in a base directory into one tab-separated file, wordnet.tsv, written
# to the current working directory.
#
# Usage: kcdictwntotsv [basedir]     (basedir defaults to ".")
#
# Each output row has five columns:
#   key (lower-cased word), sequence number, word, part of speech, gloss text

# Directory holding the data files; first command line argument, or ".".
BASEDIR = ARGV.length > 0 ? ARGV[0] : "."

# Input files, each tagged with the part-of-speech label written to the output.
DATAFILES =
  [
   { :part => "noun", :path => "#{BASEDIR}/data.noun" },
   { :part => "verb", :path => "#{BASEDIR}/data.verb" },
   { :part => "adj", :path => "#{BASEDIR}/data.adj" },
   { :part => "adv", :path => "#{BASEDIR}/data.adv" },
  ].freeze

# Name of the generated TSV file.
OUTFILE = 'wordnet.tsv'

if !File::directory?(BASEDIR)
  printf("%s is not a directory\n", BASEDIR)
  exit(1)
end

# Running record number across all input files; also drives progress output.
seq = 0
File::open(OUTFILE, "w") do |outfile|
  DATAFILES.each do |info|
    part = info[:part]
    File::open(info[:path]) do |infile|
      infile.each do |line|
        line.force_encoding('UTF-8')
        # Lines beginning with a space are not records (presumably the
        # license header of the WordNet data files -- see wndb(5WN)).
        next if line.start_with?(" ")
        # Split once, at the FIRST "|": the left side is the structured
        # record, the right side is the gloss.  Using one split keeps both
        # halves consistent even when the gloss itself contains "|", and
        # yields an empty gloss (not the raw record) when there is no "|".
        head, text = line.strip.split(/ *\| */, 2)
        next if head.nil?
        # Drop everything from the first "@" onwards before tokenizing.
        fields = head.sub(/ *\@.*/, "").split(" ")
        next if fields.length < 4
        # The fourth field is a hexadecimal count of (word, companion) pairs
        # that follow it; pivot is the number of fields those pairs occupy.
        pivot = fields[3].hex * 2
        next if pivot + 4 > fields.length
        # Keep only the first element of every pair, i.e. the words.
        faces = fields[4, pivot].each_slice(2).map(&:first)
        # A tab inside the gloss would add a column to the TSV row.
        text = text.to_s.tr("\t", " ")
        faces.each do |face|
          # Underscores stand for spaces inside multi-word entries.
          face = face.tr("_", " ").gsub(/\s+/, " ")
          key = face.downcase
          seq += 1
          outfile.printf("%s\t%d\t%s\t%s\t%s\n", key, seq, face, part, text)
          printf("%s: %d records done\n", $0, seq) if seq % 1000 == 0
        end
      end
    end
  end
end