summary refs log tree commit diff
path: root/tools/wn2rdf.pl
blob: 2fff87d66be1b5d66deed4a525bebebfdc1eded1 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
#! /usr/bin/perl -w

# Wordnet dictionary database converter
#
# Converts the Wordnet prolog data to rockbox dictionary format.
#
# Written by Miika Pekkarinen <slasher@ihme.org>
#
# $Id$

use strict;

# Lookup tables
# %words: lower-cased word => { sequence id => first character of that id },
# filled while reading wn_s.pl.
my %words;
# %descriptions: sequence id => description text, filled while reading wn_g.pl.
my %descriptions;

# Map a numeric category id to the one-letter tag used in the output.
#   1 => 'N', 2 => 'V', 3 => 'A', 4 => 'A'; anything else => '?'.
# The id is compared numerically, so '3' and 3 are treated alike.
sub getcatname {
	my ($id) = @_;
	
	# Ordered (id, tag) pairs; scanned front to back.
	my @tag_for = (
		[ 1, 'N' ],
		[ 2, 'V' ],
		[ 3, 'A' ],
		[ 4, 'A' ],
	);
	
	foreach my $entry (@tag_for) {
		my ($number, $tag) = @{$entry};
		return $tag if $id == $number;
	}
	
	return '?';
}

# Open both inputs and the output before any processing starts, so a
# missing or unwritable file is reported immediately.
# Three-argument open keeps the mode separate from the file name. The
# bareword handle names are kept because the rest of the script reads from
# and writes to IN_WORD, IN_DESC and OUTPUT.
open(IN_WORD, '<', "wn_s.pl") or die "Open fail(#1): $!";
open(IN_DESC, '<', "wn_g.pl") or die "Open fail(#2): $!";
open(OUTPUT, '>', "dict.preparsed") or die "Open fail(#3): $!";

print "Reading word file...\n";

# Read every word record into %words, keyed first by the lower-cased word
# and then by the record's sequence id.
while (my $line = <IN_WORD>) {
	# Strip the line ending; CR is removed too so CRLF input still matches
	# the end-of-record anchor below.
	$line =~ s/[\r\n]+\z//;
	next if $line eq '';
	
	# s(100001740,1,'entity',n,1,11). => 100001740,1,'entity',n,1,11
	$line =~ s/^s\((.*)\)\.$/$1/;
	
	# Only the first field (sequence id) and third field (word) are used.
	# NOTE(review): a word that itself contains a comma would be split in
	# the wrong place here -- confirm the input never has one.
	my ($seqid, undef, $word) = split /,/, $line, 6;
	unless (defined $word) {
		warn "Skipping malformed word line $.: $line\n";
		next;
	}
	
	# 'entity' => entity
	$word =~ s/^'(.*)'$/$1/;
	# Collapse every doubled quote to a single one; without /g only the
	# first occurrence in the word would be fixed.
	$word =~ s/''/'/g;
	
	# The first character of the id is stored as the word's category.
	my $category = substr $seqid, 0, 1;
	
	$words{lc $word}{$seqid} = $category;
}

close IN_WORD;

print "Reading description file...\n";

# Read every description record into %descriptions, keyed by sequence id.
while (my $line = <IN_DESC>) {
	# Strip the line ending; CR is removed too so CRLF input still matches
	# the end-of-record anchor below.
	$line =~ s/[\r\n]+\z//;
	next if $line eq '';
	
	# g(100002056,'(a separate and self-contained entity)').
	# => 100002056,'(a separate and self-contained entity)'
	$line =~ s/^g\((.*)\)\.$/$1/;
	
	# Split on the first comma only: the description may contain commas.
	my ($seqid, $desc) = split /,/, $line, 2;
	unless (defined $desc) {
		warn "Skipping malformed description line $.: $line\n";
		next;
	}
	
	# '(text)' => text
	$desc =~ s/^'\((.*)\)'$/$1/;
	# Collapse every doubled quote to a single one; without /g only the
	# first occurrence in the description would be fixed.
	$desc =~ s/''/'/g;
	
	$descriptions{$seqid} = $desc;
}

close IN_DESC;

print "Sorting and writing output...\n";

# Emit one line per word, in sorted word order:
#   word <TAB> "N: 1. ... 2. ..." <TAB> "V: 1. ..." ...
# Each section holds the numbered descriptions of one category.
foreach my $word (sort keys %words) {
	my %categories;
	
	# Collect all descriptions of the word, grouped by category.
	# The ids are sorted so the numbering of definitions is the same on
	# every run; plain keys() order is not stable.
	foreach my $id (sort keys %{ $words{$word} }) {
		my $catid = $words{$word}{$id};
		my $description = $descriptions{$id};
		
		if (!defined($description) or $description eq '') {
			# Report the id that could not be linked (not the hash
			# reference holding all ids of the word).
			print "Error: Failed to link word: $word / ",
			  $id, "\n";
			exit 1;
		}
		
		push @{ $categories{$catid} }, $description;
	}
	
	my @sections;
	
	# 1 = noun
	# 2 = verb
	# 3 = adjective
	# 4 = adverb
	for my $catid (1 .. 4) {
		my $descs = $categories{$catid} or next;
		
		# "1. first 2. second ..."
		my $n = 0;
		my $catdesc = join ' ', map { my $i = ++$n; "$i. $_" } @{$descs};
		
		push @sections, getcatname($catid) . ": $catdesc";
	}
	
	# Every word must have at least one description in categories 1..4.
	die "Internal error" unless @sections;
	
	print OUTPUT "$word\t", join("\t", @sections), "\n"
	  or die "Write fail: $!";
}

# Buffered write errors only surface at close, so check it before
# claiming success.
close OUTPUT or die "Close fail: $!";

print "Done, output was successfully written!\n";