#!/usr/bin/perl ## -*-cperl-*- ## Run simple queries against the Google n-gram database ## $| = 1; use warnings; use strict; use lib "/net/aistaff/gosse/perl5/lib/perl5/x86_64-linux-gnu-thread-multi" ; use Time::HiRes qw(time); use DBI; use DBD::SQLite; use CGI qw(:standard *table *div); use lib "."; use Twitter_NL_CGI; ## ---- hard-coded configuration options are now in Web1T5_CGI.pm ## ---- user options (set through HTML form) our $Query = ""; # database query our $Limit = 100; # number of results that will be displayed our $Threshold = 10; # frequency threshold (database contains n-grams with f >= 10 over word types with f >= 10) our $Wildcards = ""; # optional values: "group" (for each different filler) or "collapse" (all fillers for each wildcard term) our $HideFixed = 0; # whether fixed elements in result set are shown (0) or not (1) our $Mode = "help"; # script mode: "help", "search" (standard query), "csv" (CSV table), "xml" (Web service) our $Debug = 0; # debugging mode displays SQL query ## ---- check "mode" parameter now to suppress HTML output in XML mode (and change error messages) my $mode_param = param("mode"); if ($mode_param) { $mode_param = lc($mode_param); unless ($mode_param =~ /^(help|search|xml|csv)$/) { print header(-status => "400 Bad Request"), "\n"; exit 0; } $Mode = $mode_param; } our $html_output = ($Mode eq "xml" or $Mode eq "csv") ? 0 : 1; ## ---- print HTML header and page title if ($html_output) { printHtmlHeader("Twitter_NL_freq.perl"); print h1("Query Form"); print start_form(-method => "GET", -action => "$RootUrl/Twitter_NL_freq.perl"), table({-style => "margin: 1em 2em 0em 2em;"}, Tr(td(b("Search pattern:"), textfield(-name => "query", -value => "", -size => 50, -maxlength => 512), ), td({-width => 30}, ""), td(submit(-name => "mode", -value => "Search"), submit(-name => "mode", -value => "CSV"), submit(-name => "mode", -value => "XML"), ), ), Tr(td("• display first", popup_menu(-name => "limit", -values => [50,100,200,500,1000,10000], -default => 50), "N-grams with frequency ≥ ", popup_menu(-name => "threshold", -values => [10,20,40,100,200,500,1000,5000,10000,100000], -default => 40), ), td(""), td(submit(-name => "mode", -value => "Help"), ' ', checkbox(-name => "debug", -value => "on", -checked => 0, -label => "Debug"), ' ', checkbox(-name => "optimize", -value => "on", -checked => $Optimize, -label => "Optim."), ), ), Tr(td("• variable elements are", popup_menu(-name => "wildcards", -values => ["listed normally", "grouped", "collapsed"], -default => "listed normally"), ", constant elements are", popup_menu(-name => "fixed", -values => ["shown", "hidden"], -default => "shown"), ), td(""), td(defaults("Reset Form")), ), ), end_form, "\n\n"; } ## ---- read and validate parameters $Query = param("query") || $Query; $Query =~ s/^\s+//; $Query =~ s/\s+$//; my $limit_param = param("limit"); if ($limit_param) { htmlError($Mode, "invalid result set limit '$limit_param'") unless $limit_param =~ /^[0-9]+$/ and $limit_param >= 1 and $limit_param <= 10000; $Limit = int($limit_param); } my $threshold_param = param("threshold"); if ($threshold_param) { htmlError($Mode, "invalid frequency threshold '$threshold_param'") unless $threshold_param =~ /^[0-9]+$/ and $threshold_param >= 10; $Threshold = int($threshold_param); } my $wildcards_param = param("wildcards"); if ($wildcards_param) { if ($wildcards_param =~ /normal/i) { $Wildcards = "" } elsif ($wildcards_param =~ /group/i) { $Wildcards = "group" } elsif ($wildcards_param =~ /collapse/i) { $Wildcards = "collapse" } else { htmlError($Mode, "invalid wildcards behaviour '$wildcards_param' selected") } } my $fixed_param = param("fixed"); if ($fixed_param) { if ($fixed_param =~ /hidden/i) { $HideFixed = 1 } elsif ($fixed_param =~ /shown/i) { $HideFixed = 0 } else { htmlError($Mode, "invalid behaviour '$fixed_param' for constant elements") } } my $debug_param = param("debug") || ""; $Debug = 1 if lc($debug_param) eq "on"; my $optimize_param = param("optimize") || ""; # apparently, parameter is simply undefined if the option is not set $Optimize = (lc(param("optimize")) eq "on") ? 1 : 0; ## ---- temporary "under construction" warning # htmlError($Mode, "THIS VERSION OF THE INTERFACE IS UNDER CONSTRUCTION - PLEASE COME BACK IN A FEW DAYS"); ## ---- HELP page if ($Mode eq "help") { print h1("Instructions: Frequency list"), "\n"; # htmlError(@INC) ; print p( "This Web interface allows you to run interactive queries on an indexed version of the Dutch Twitter corpus (jan 2011-dec 2014, approx 2.6 billion tweets and 28 billion tokens) collected by the University of Groningen.", "Using a frequency cut-off of 10, the database contains approx. 6.65M distinct unigrams, 61M bigrams, 135M trigrams, 147M 4-grams, and 125M 5-grams. Note that only n-gram counts are given here. Use the Trends option to see global trends in the data. For viewing daily trends in word frequencies, locations, and the actual tweets, see", a({-href=>"http://www.let.rug.nl/~kleiweg/bin/dagtwform.py", -target => "blank", -class => "external"}, "Woordfrequenties op Twitter") , "or", a({-href=>"http://145.100.57.91/cgi-bin/twitter", -target => "blank", -class => "external"}, "twiqs.nl"), "(hosted by the e-science center). The data collection has been described in a number of", a({-href=>"http://scholar.google.com/scholar?as_q=Twitter&as_epq=&as_oq=&as_eq=&as_occt=any&as_sauthors=Erik+Tjong+Kim+Sang&as_publication=&as_ylo=&as_yhi=&btnG=&hl=nl&as_sdt=0%2C5", -target => "blank", -class => "external"}, "papers"), "by Erik Tjong Kim Sang.", "N-grams have been indexed in an SQLite database with a total size of 35 gigabytes using software developed by", a({-href=>"http://www.stefan-evert.de", -target => "blank", -class => "external"}, "Stefan Evert"), " (", a({-href=>"http://purl.org/stefan.evert/PUB/Evert2010_WAC6.pdf", -target => "blank", -class => "external"}, "paper"), ")" ) ; print p( "If you want to rank matches by their association strength instead, click the", i("Associations"), "tab at the top of this page.") ; print p( "For any further questions or bug reports, please contact ", a({-href => "http://www.let.rug.nl/gosse", -target => "_blank", -class => "external"}, "Gosse Bouma")."."), "\n"; print h2("Search pattern"), "\n"; print p("The search pattern consists of up to 5 terms, which represent the elements of an N-gram and must be separated by blanks.", "Our database engine supports five different types of search terms:"), "\n"; print ul(li("a", b("literal term"), "matches the specified word form", "(e.g.", code("literatuur"), "→", i("literatuur").")"), li("a", b("word set"), "matches any of the listed word forms", "(e.g.", code("[huis,huisje]"), "→", i("huis, huisje").")"), li(b("wildcard terms"), "use", b(code("%")), "to stand for an arbitrary substring", "(e.g.", code("onmidde%ijk"), "→", i("onmiddellijk,onmiddelijk,...").")"), li("the asterisk", b(code("*")), "matches an", b("arbitrary word"), "(usually the item of interest)"), li("a question mark", b(code("?")), "indicates a", b("skipped token").", which will be ignored in the result set"), ), "\n"; print p("Push the", b("Search"), "button to execute your query,", b("Help"), "to display this help page,", "or", b("Reset Form"), "to start over from scratch.", "The", b("CSV"), "button returns a CSV table suitable for import into a spreadsheet program or database.", "The", b("XML"), "button returns the search results in an XML format, allowing this interface to be used as a Web service."), "\n"; print h2("Options"), "\n"; print p("You can customise the display format of search results with the option menus below the search pattern:"), "\n"; print ul(li("select", b("how many N-grams"), "will be displayed (up to 10,000)"), li("only show N-grams above a certain", b("frequency threshold"), "(default: 10)"), li(b("variable elements"), "in a query (those matching a wildcard term or word set) can be:", ul(li(i("listed normally"), "as separate n-grams"), li(i("grouped"), "together, so there is one group for every different word form"), li(i("collapsed"), "by summing over all matching word forms"), )), li("optionally,", b("constant elements"), "(those matching a literal term, or variable elements that have been collapsed) can be suppressed for more concise output"), ), "\n"; print h2("Examples"), "\n"; print p("The examples below include comments starting with", code("//").", which must not be entered in the search pattern field."), "\n"; print pre(<<'STOP'), "\n"; interessante * // what are people most interested in? * viool // '*' at the start of a query is much slower sprak ? * [man,vrouw] // use '?' to skip determiner etc. [houd, houdt, houden] van ? * // what do people enjoy? //notice the space at the start // (use "collapsed" display) %name ? * geweld // use with "grouped" display van * tot * // a classic of Googleology anti-establishment ? // a trick to obtain unigram frequencies STOP print h1(""), "\n"; print end_html, "\n"; } ## ---- common preparations for SEARCH, XML and CSV operations else { htmlError($Mode, "please enter a search pattern") if $Query eq ""; checkRunningJobs($Mode); # abort if too many request are already being processed ## ---- SEARCH operatione if ($Mode eq "search") { print h1("Results"), "\n" ; my $T0 = time; my @results = execute_query(); my $dT = time - $T0; print p({-class => "backlink"}, sprintf "%d matches in %.2f seconds", @results+0, $dT), "\n"; print start_table({-style => "margin: 1em 2em 0em 2em;"}), "\n"; foreach my $line (@results) { my ($f, $s) = @$line; if ($s !~ /XXX_END_OF_SENTENCE_XXX/ && $s !~ /XXX_BEGIN_OF_SENTENCE_XXX/ ) { if ($f eq "GROUP") { print Tr(td({-colspan => 3}, hr)), "\n"; } else { print Tr(td({-align => "right"}, $f), td({-width => 10}, ""), td({-align => "left"}, escapeHTML($s))), "\n"; } } } print end_table, "\n"; print h1(""), "\n"; print end_html, "\n"; } ## ---- XML Web service elsif ($Mode eq "xml") { my @results = execute_query(); print header(-type => "application/xml"); print '', "\n"; print "\n"; my $in_group = 0; foreach my $line (@results) { my ($f, $s) = @$line; if ($s !~ /XXX_END_OF_SENTENCE_XXX/ && $s !~ /XXX_BEGIN_OF_SENTENCE_XXX/ ) { if ($f eq "GROUP") { print "\n" if $in_group; print "\n"; $in_group = 1; } else { print "\n"; print "\t$f\n"; print "\t",escapeHTML($s),"\n"; print "\n"; } } } print "\n" if $in_group; print "\n"; } ## ---- CSV table elsif ($Mode eq "csv") { my @results = execute_query(); print header(-type => "text/comma-separated-values", -attachment => "Web1T5_frequency_list.csv"); print '"frequency","N-gram"', "\n"; foreach my $line (@results) { my ($freq, $ngram) = @$line; if ($ngram !~ /XXX_END_OF_SENTENCE_XXX/ && $ngram !~ /XXX_BEGIN_OF_SENTENCE_XXX/ ) { if ($freq eq "GROUP") { print '"",""', "\n"; } else { printf '%d,"%s"%s', $freq, $ngram, "\n"; } } } } } exit 0; ## ---- SUB execute_query() ... run query against SQLite database (uses global variables) sub execute_query { ## split query into terms and check whether suitable n-gram database is available my @Terms = split " ", $Query; my $N = @Terms; #print p("executing query") ; ## n-gram database files (filenames are hard-coded so far) htmlError($Mode, "can't find vocabulary database file '$VocabFile' (internal error)") unless -f $VocabFile; my $NgramFile = sprintf $NgramFilePattern, $N; #print p($NgramFile) ; htmlError($Mode, "no data available for $N-grams, sorry!", "(cannot find the database file '$NgramFile')") unless -f $NgramFile; ## open SQLite database files my $DBH = DBI->connect("dbi:SQLite:dbname=$NgramFile", "", "", { RaiseError => 1, AutoCommit => 1 }); #print p("opened sql db part 1 ") ; $DBH->do("PRAGMA temp_store_directory = ".$DBH->quote($TempDir)) if $TempDir; $DBH->do("PRAGMA synchronous = 0"); #print p("opened sql db") ; my ($res) = $DBH->selectrow_array("SELECT value FROM meta WHERE key = 'n'"); htmlError($Mode, "'$NgramFile' is not a proper $N-gram database (internal error)") unless $res and $res == $N; ($res) = $DBH->selectrow_array("PRAGMA page_size"); html_Error("can't determine page size of $N-gram database file (internal error)") unless $res and $res >= 512 and $res <= 32768; my $CachePages = int($CacheSize / $res); $DBH->do("PRAGMA cache_size = $CachePages"); my ($Normalize) = $DBH->selectrow_array("SELECT value FROM meta WHERE key = 'normalize'"); htmlError($Mode, "format error in '$NgramFile' (internal error)") unless defined $Normalize; $DBH->do("ATTACH ".$DBH->quote($VocabFile)." AS vocabulary"); ($res) = $DBH->selectrow_array("SELECT value FROM vocabulary.meta WHERE key = 'normalize'"); htmlError($Mode, "normalization status of '$VocabFile' doesn't match '$NgramFile' (internal error)") unless $res == $Normalize; #print p("constructing sql") ; ## construct SQL expression for n-gram query my @QT = map { { -K => $_ } } 1 .. $N; # query terms contain all relevant information my %LocalID = ("-1" => ".."); # negative IDs for special locally defined strings foreach my $k (1 .. $N) { my $idx = $k - 1; # array subscript for k-th term my $term = $Terms[$idx]; if ($term =~ /^%+$/) { htmlError($Mode, "wildcard-only term '$term' is not allowed (use ? or * instead)"); } if ($term eq "?") { $QT[$idx]{-type} = "skip"; # ? = ignore this position (local ID = -1 for placeholder "..") $QT[$idx]{-var} = "-1"; } else { $QT[$idx]{-var} = "w$k"; # all other positions are included in the result if ($term eq "*") { $QT[$idx]{-type} = "collocate"; # "*" terms represent collocate positions we're interested in } else { $QT[$idx]{-type} = "lexical"; # other terms are constraints to be matched in the query my $where_clause = undef; my $op = undef; if ($term =~ /^\[(.+)\]$/) { my @words = grep { s/\s+//; not /^$/ } split /,/, $1; # list of literal word forms, e.g. [mouse,mice] @words = map { normalizeString($Mode, $_) } @words if $Normalize; htmlError($Mode, "wildcard '%' not allowed in word list $term") if grep {/\%/} @words; $op = "IN"; $where_clause = "WHERE w IN (". join(", ", map {$DBH->quote($_)} @words) . ")"; } else { $term = normalizeString($Mode, $term) if $Normalize; $op = ($term =~ /\%/) ? "LIKE" : "="; $where_clause = "WHERE w $op ".$DBH->quote($term); } $QT[$idx]{-sql} = "w$k IN (SELECT id FROM vocabulary $where_clause)"; if ($Optimize) { my ($freq) = $DBH->selectrow_array("SELECT SUM(f) FROM vocabulary $where_clause"); my $cost = $freq || 0; $cost /= 1000 # table data are ordered by w1 => assume random access is 1000 x as expensive if $k == 1; $QT[$idx]{-cost} = $cost; # shuffle constraints so that least frequent term come first } else { $QT[$idx]{-cost} = $k; # ensures that constraints are kept in original order without --optimize } if (($op eq "LIKE" or $op eq "IN") and ($Wildcards eq "group")) { $QT[$idx]{-order} = "w$k"; } if (($op eq "=") or ($Wildcards eq "collapse")) { $LocalID{-($k+1)} = $term; # first query term has local ID -2, etc. $QT[$idx]{-var} = "-".($k+1)." AS const$k"; # replace variable by constant (= local ID) in SQL query } } } } ## collect SQL constraints and re-order them if --optimize has been specified my @SQL_constraints = map {$_->{-sql}} sort {$a->{-cost} <=> $b->{-cost}} grep {defined $_->{-sql}} @QT; htmlError($Mode, "you have to specify at least one lexical item in your query!") unless @SQL_constraints > 0; if ($Optimize) { my $have_index_term = 0; # explicitly mark where index should be used (otherwise SQLite might make poor choices withou ANALYZE) @SQL_constraints = map { if (/^w[1-5]/) { if ($have_index_term) { "+$_"; # explicitly disallow use of index on any but the first SQL constraint } else { $have_index_term = 1; $_; } } else { $_ } } @SQL_constraints; } ## construct full SQL query my $columns = join(", ", map { $_->{-var} } @QT); my $constraints = join(" AND ", @SQL_constraints); our $SQL = "SELECT $columns, SUM(f) AS freq FROM ngrams WHERE $constraints"; my $group_vars = join(", ", grep {/^w/} map { $_->{-var} } @QT); if ($group_vars) { $SQL .= " GROUP BY $group_vars"; } else { my @const_vars = map { (/(const[0-9])$/) ? $1 : () } map { $_->{-var} } @QT; htmlError($Mode, "can't find constant field for dummy GROUP BY clause (internal error)") unless @const_vars > 0; $SQL .= " GROUP BY $const_vars[0]"; } $SQL .= " HAVING freq >= $Threshold" if $Threshold; my @order_vars = grep { $_ } map { $_->{-order} } @QT; push @order_vars, "freq DESC"; $SQL .= " ORDER BY ".join(", ", @order_vars) if @order_vars; $SQL .= " LIMIT $Limit" if $Limit; ## show SQL query in debug mode if ($Debug) { print h2("SQL Query"), "\n"; print p({-style => "margin-left: 2em; margin-right:1em; font-size: 90%;"}, code(escapeHTML($SQL))); print "\n\n"; } ## execute SQL query (returns table of ID values) my $id_table = $DBH->selectall_arrayref($SQL); my $n_rows = @$id_table; ## translate IDs back to strings using vocabulary database (with local memoization) my $id2str_query = $DBH->prepare("SELECT w FROM vocabulary WHERE id = ?"); my %id2str = (); # local lookup hash for memoization my @group_ids = map { -1 } 1 .. $N; # keep track of current group by ID values (--group option) my @lines = (); # collect output lines to be returned (format: [$freq, $ngram] or ["GROUP", $ngram]) foreach my $row (@$id_table) { my @id = @$row; my $f = pop @id; my $start_group = 0; if ($Wildcards eq "group") { ## --group option: check whether variable terms are different from previous item foreach my $i (0 .. $N-1) { if ($QT[$i]{-order}) { # only query terms with this attribute are relevant for grouping if ($group_ids[$i] != $id[$i]) { $start_group++; $group_ids[$i] = $id[$i]; } } } } ## translate IDs to strings, using vocabulary database and local IDs my @str = (); foreach my $id (@id) { if ($id < 0) { push @str, $LocalID{$id} || "???" # negative IDs are special locally defined strings for constant elements unless $HideFixed; } else { my $s = $id2str{$id}; if (not defined $s) { $id2str_query->execute($id); ($s) = $id2str_query->fetchrow_array; $s = "__ERROR__" unless defined $s; htmlError($Mode, "multiple entries for vocabulary ID #$id (internal error)") if $id2str_query->fetchrow_arrayref; $id2str{$id} = $s; } push @str, $s; } } push @lines, ["GROUP", ""] if $start_group; push @lines, [$f, "@str"]; } ## disconnect from database undef $id2str_query; $DBH->disconnect if $DBH; undef $DBH; ## return collected results to main program return @lines; }