#!/usr/bin/perl
# Usage: dt_search [OPTIONS] '//node[@pos="adv" and ../@cat="np"]' cdb/*.xml

use Getopt::Std;

# Only the "vars" stricture is enabled: the label counters below are package
# variables that other parts of this script may look up by name (symbolic
# references), which "strict refs" would forbid.
use strict "vars";

# Option flags; getopts() fills these in as package variables.
our ($opt_l, $opt_s, $opt_h, $opt_c);

# Sentence currently being reported on, and the running number of matches.
my (@words, $filename, $sentence);
my $match_total = 0;

# Frequency tables (label value => number of occurrences) for the rel, cat
# and pos attributes.  These must be package variables, not lexicals: the
# counting and printing code addresses them as %main::rel, %main::cat and
# %main::pos.
our (%rel, %cat, %pos);
my %counter_for = (rel => \%rel, cat => \%cat, pos => \%pos);

my $query;
my @files;

my $help = "Usage: $0 [Options] Query Files

Search Files for trees matching Query.

Query uses XPath syntax, see http://www.w3.org/TR/xpath

Options:

  -l : print stats for values of rel,cat,pos labels in matching nodes
  -s : show sentences with matching phrases
  -c : show matching constituents only, not the full sentence.
  -h : This message.

";

# Start "xmlmatch" with the given argument list and return a read handle on
# its standard output.  The list form of open is used, so no shell is
# involved and a failure to start the program is reported in this process.
sub _open_xmlmatch {
    my @args = @_;
    open(my $pipe, "-|", "xmlmatch", @args)
        or die "dt_search: Error: Could not run xmlmatch: $!\n";
    return $pipe;
}

# Close a handle obtained from _open_xmlmatch and die if the pipe could not
# be closed or xmlmatch exited with a non-zero status.
sub _close_xmlmatch {
    my ($pipe) = @_;
    return if close($pipe);
    # For a piped close, $! is 0 when the only problem is the exit status.
    die "dt_search: Error closing xmlmatch pipe: $!\n" if $!;
    die "dt_search: Error: xmlmatch exited with status " . ($? >> 8) . "\n";
}

# Default mode: replace this process by "xmlmatch -q QUERY FILES".
# Only returns (by dying) when xmlmatch could not be executed.
sub do_filename_output {
    exec("xmlmatch", "-q", $query, @files)
        or die "dt_search: Error: Could not exec xmlmatch: $!\n";
}

# Count the rel/cat/pos values reported by xmlmatch through the
# gen-stat-info stylesheet and print the resulting tables.
#
# This one is faster than the --show-match version because Perl has to
# process considerably less data, even though the xmlmatch step itself
# takes longer than with --show-match. (!)
sub do_stats {
    my $stylesheet = $ENV{ALPINO_HOME} . "/stylesheets/gen-stat-info.xsl";

    my $pipe = _open_xmlmatch("--block-buffered", "-s", $stylesheet,
                              "-q", $query, @files);
    while (my $line = <$pipe>) {
        chomp $line;
        # Lines of interest look like "cat=np", "pos=adv" or "rel=mod".
        if ($line =~ /^(cat|pos|rel)=(.*)/) {
            ++$counter_for{$1}{$2};
        }
    }
    _close_xmlmatch($pipe);

    print_stats();
}

# Same result as do_stats, but parses the "--show-match" output instead of
# using a stylesheet.
#
# Although xmlmatch itself finishes sooner than when using the
# gen-stat-info stylesheet, parsing the larger amount of data makes this
# take longer overall.
sub do_stats_slower {
    my ($content, $attr) = ("", "");

    my $pipe = _open_xmlmatch("--block-buffered", "--show-match",
                              "-q", $query, @files);
    while (my $line = <$pipe>) {
        if ($line =~ /ATTRIBUTE (rel|cat|pos)/) {
            # Remember which attribute the next "content=" line belongs to.
            $attr = $1;
        }
        elsif ($line =~ /content=(.*)$/) {
            $content = $1;
            if ($attr) {
                ++$counter_for{$attr}{$content};
            }
            $attr = "";
        }
    }
    _close_xmlmatch($pipe);

    print_stats();
}

# This one no longer needs grep to fish out the sentence and is
# therefore quite a bit faster.
# Run xmlmatch with the gen-bracket-info stylesheet and print, for every
# record it reports, the file name followed by the sentence with the matching
# constituents marked by square brackets.  Ends with the total match count.
#
# The stylesheet output is read as "name=value" lines (filename, sentence,
# begin, end); a line consisting of a single "-" ends a record.
sub do_matching_constituents {
    my $stylesheet = $ENV{ALPINO_HOME} . "/stylesheets/gen-bracket-info.xsl";

    # List-form pipe open: no shell is involved, and a failure to start
    # xmlmatch is reported here rather than in a forked child.
    open(my $pipe, "-|", "xmlmatch", "--line-buffered", "-s", $stylesheet,
         "-q", $query, @files)
        or die "dt_search: Error: Could not run xmlmatch: $!\n";

    while (my $line = <$pipe>) {
        chomp $line;

        if ($line =~ /^(filename|sentence|begin|end)=(.*)/) {
            my ($name, $value) = ($1, $2);

            if ($name eq 'filename') {
                $filename = $value;
            }
            elsif ($name eq 'sentence') {
                $sentence = $value;
                @words    = split(' ', $sentence);
            }
            elsif ($name eq 'begin') {
                # "begin" is the index of the first word of the match.
                $match_total++;
                $words[$value] = "[" . $words[$value]
                    if _is_word_index($value);
            }
            else {
                # "end" is one past the index of the last word of the match.
                $words[$value - 1] .= "]"
                    if _is_word_index($value - 1);
            }
        }
        elsif ($line eq '-') {
            print_matching_string();
        }
    }

    if (!close($pipe)) {
        # For a piped close, $! is 0 when the only problem is the exit status.
        die "dt_search: Error closing xmlmatch pipe: $!\n" if $!;
        die "dt_search: Error: xmlmatch exited with status "
            . ($? >> 8) . "\n";
    }

    print "\n$match_total Matching substrings\n";
}

# True when the argument is a non-negative integer that indexes an existing
# element of @words.  Keeps malformed begin/end values from creating new
# words or from wrapping around to the end of the sentence.
sub _is_word_index {
    my ($index) = @_;
    return $index =~ /^\d+$/ && $index <= $#words;
}

# Print the rel, cat and pos frequency tables, most frequent label first;
# labels with the same count are listed alphabetically.
sub print_stats {
    # The counters live in the package variables %main::rel, %main::cat and
    # %main::pos.  They are named fully qualified here so that they are found
    # regardless of any lexical variables with the same names.
    my %counts_for = (
        rel => \%main::rel,
        cat => \%main::cat,
        pos => \%main::pos,
    );

    # Walk over the rel, cat and pos tables in turn.
    foreach my $attr (qw(rel cat pos)) {
        my $counts = $counts_for{$attr};
        print "$attr:\n";

        # Sort by frequency.
        my @labels = sort {
            $counts->{$b} <=> $counts->{$a} or $a cmp $b
        } keys %$counts;

        foreach my $label (@labels) {
            printf("%8d %s\n", $counts->{$label}, $label);
        }
    }
}

# Print one output line for the current sentence: the file name, a tab, and
# then either every word (-s) or only the words inside brackets (-c).
sub print_matching_string {
    print "$filename\t";

    my $depth = 0;    # number of brackets currently open
    foreach my $word (@words) {
        if ($opt_s) {
            print "$word ";
        }
        elsif ($opt_c) {
            # Opening brackets count before the word is printed, closing
            # brackets after it, so both boundary words are included.
            $depth += ($word =~ tr/[//);
            print "$word " if $depth;
            $depth -= ($word =~ tr/]//);
        }
    }
    print "\n";
}

# Parse the command line and dispatch to the requested output mode:
#   -l        label statistics,
#   -s or -c  matching sentences / constituents,
#   otherwise the plain xmlmatch output.
sub main {
    # Fall back to the current directory when ALPINO_HOME is unset or empty.
    # (A later "please set ALPINO_HOME" check could never trigger after this
    # default and has been dropped.)
    $ENV{ALPINO_HOME} ||= "./";

    # Unknown options are a usage error.
    getopts('lshc') or die $help;
    die $help if ($opt_h or @ARGV < 2);

    $query = shift(@ARGV);
    @files = @ARGV;

    if ($opt_l) {
        do_stats();
    }
    elsif ($opt_s or $opt_c) {
        do_matching_constituents();
    }
    else {
        do_filename_output();
    }
}

### ENTRY POINT
main();