#!/usr/bin/perl
## -*-cperl-*-
## Run simple queries against the Google n-gram database
##
$| = 1;

use warnings;
use strict;

use lib "/net/aistaff/gosse/perl5/lib/perl5/x86_64-linux-gnu-thread-multi" ;
use Time::HiRes qw(time);
use DBI;
use DBD::SQLite;
use CGI qw(:standard *table *div);

use lib ".";
use Twitter_NL_CGI;

## ---- hard-coded configuration options are now in Web1T5_CGI.pm

## ---- user options (set through HTML form)
our $Query = "";        # database query
our $Limit = 100;       # number of results that will be displayed
our $Threshold = 10;   # frequency threshold (database contains n-grams with f >= 10 over word types with f >= 10)
our $Wildcards = "";    # optional values: "group" (for each different filler) or "collapse" (all fillers for each wildcard term)
our $HideFixed = 0;     # whether fixed elements in result set are shown (0) or not (1)
our $Mode = "help";     # script mode: "help", "search" (standard query), "csv" (CSV table), "xml" (Web service)
our $Debug = 0;         # debugging mode displays SQL query

## ---- check "mode" parameter now to suppress HTML output in XML mode (and change error messages)
my $mode_param = param("mode");
if ($mode_param) {
  $mode_param = lc($mode_param);
  unless ($mode_param =~ /^(help|search|xml|csv)$/) {
    print header(-status => "400 Bad Request"), "\n";
    exit 0;
  }
  $Mode = $mode_param;
}
our $html_output = ($Mode eq "xml" or $Mode eq "csv") ? 0 : 1;


## ---- print HTML header and page title
if ($html_output) {
  printHtmlHeader("Twitter_NL_freq.perl");

  print
    h1("Query Form");

  print
    start_form(-method => "GET", -action => "$RootUrl/Twitter_NL_freq.perl"),
    table({-style => "margin: 1em 2em 0em 2em;"},
        Tr(td(b("Search pattern:"), 
              textfield(-name => "query", -value => "", -size => 50, -maxlength => 512),
             ),
           td({-width => 30}, ""),
           td(submit(-name => "mode", -value => "Search"),
              submit(-name => "mode", -value => "CSV"),
              submit(-name => "mode", -value => "XML"),
             ),
          ),
        Tr(td("&bull; display first",
              popup_menu(-name => "limit", -values => [50,100,200,500,1000,10000], -default => 50),
              "N-grams with frequency &ge; ", 
              popup_menu(-name => "threshold", -values => [10,20,40,100,200,500,1000,5000,10000,100000], -default => 40),
             ),
           td(""), 
           td(submit(-name => "mode", -value => "Help"),
              '&nbsp;',
              checkbox(-name => "debug", -value => "on", -checked => 0, -label => "Debug"),
              '&nbsp;',
              checkbox(-name => "optimize", -value => "on", -checked => $Optimize, -label => "Optim."),
             ),
          ),
        Tr(td("&bull; variable elements are",
              popup_menu(-name => "wildcards", -values => ["listed normally", "grouped", "collapsed"], -default => "listed normally"),
              ", constant elements are",
              popup_menu(-name => "fixed", -values => ["shown", "hidden"], -default => "shown"),
             ),
           td(""), 
           td(defaults("Reset Form")),
          ),
         ),
    end_form, "\n\n";
}

## ---- read and validate parameters
$Query = param("query") || $Query;
$Query =~ s/^\s+//; $Query =~ s/\s+$//;
my $limit_param = param("limit");
if ($limit_param) {
  htmlError($Mode, "invalid result set limit '$limit_param'")
    unless $limit_param =~ /^[0-9]+$/ and $limit_param >= 1 and $limit_param <= 10000;
  $Limit = int($limit_param);
}
my $threshold_param = param("threshold");
if ($threshold_param) {
  htmlError($Mode, "invalid frequency threshold '$threshold_param'")
    unless $threshold_param =~ /^[0-9]+$/ and $threshold_param >= 10;
  $Threshold = int($threshold_param);
}
my $wildcards_param = param("wildcards");
if ($wildcards_param) {
  if ($wildcards_param =~ /normal/i)      { $Wildcards = "" }
  elsif ($wildcards_param =~ /group/i)    { $Wildcards = "group" }
  elsif ($wildcards_param =~ /collapse/i) { $Wildcards = "collapse" }
  else { htmlError($Mode, "invalid wildcards behaviour '$wildcards_param' selected") }
}
my $fixed_param = param("fixed");
if ($fixed_param) {
  if ($fixed_param =~ /hidden/i)   { $HideFixed = 1 }
  elsif ($fixed_param =~ /shown/i) { $HideFixed = 0 }
  else { htmlError($Mode, "invalid behaviour '$fixed_param' for constant elements") }
}
my $debug_param = param("debug") || "";
$Debug = 1 if lc($debug_param) eq "on";
my $optimize_param = param("optimize") || ""; # apparently, parameter is simply undefined if the option is not set
$Optimize = (lc(param("optimize")) eq "on") ? 1 : 0;


## ---- temporary "under construction" warning
# htmlError($Mode, "THIS VERSION OF THE INTERFACE IS UNDER CONSTRUCTION - PLEASE COME BACK IN A FEW DAYS");

## ---- HELP page
if ($Mode eq "help") {
  print h1("Instructions: Frequency list"), "\n";
#  htmlError(@INC) ;
  print p(
          "This Web interface allows you to run interactive queries on an
indexed version of the <b>Dutch Twitter corpus</b> (jan 2011-dec 2014, approx 2.6 billion tweets and 28 billion tokens) collected by the University of Groningen.",
"Using a frequency cut-off of 10, the database contains  approx. 6.65M <em>distinct</em> unigrams,
61M bigrams,
135M trigrams,
147M 4-grams,
and 125M 5-grams. 
Note that only n-gram counts are given here. Use the <b>Trends</b> option to see global trends in the data. For viewing daily trends in word frequencies, locations, and the actual tweets, see",
a({-href=>"http://www.let.rug.nl/~kleiweg/bin/dagtwform.py", -target => "blank", -class => "external"}, "Woordfrequenties op Twitter")
, "or", 
a({-href=>"http://145.100.57.91/cgi-bin/twitter", -target => "blank", -class => "external"}, "twiqs.nl"),
"(hosted by the e-science center). The data collection has been described in a number of",

a({-href=>"http://scholar.google.com/scholar?as_q=Twitter&as_epq=&as_oq=&as_eq=&as_occt=any&as_sauthors=Erik+Tjong+Kim+Sang&as_publication=&as_ylo=&as_yhi=&btnG=&hl=nl&as_sdt=0%2C5",
-target => "blank", -class => "external"}, "papers"), "by Erik Tjong Kim Sang.",
"N-grams have been indexed in an SQLite database
with a total size of 35 gigabytes using software developed by",
a({-href=>"http://www.stefan-evert.de", -target => "blank", -class => "external"}, "Stefan Evert"),
          " (",
a({-href=>"http://purl.org/stefan.evert/PUB/Evert2010_WAC6.pdf", -target => "blank", -class => "external"}, "paper"), 
")"
) ;
   print p(
          "If you want to rank matches by their association strength instead, click the", 
i("Associations"), 
"tab at the top of this page.") ;
   print p(
          "For any further questions or bug reports, please contact ",
          a({-href => "http://www.let.rug.nl/gosse", -target => "_blank", -class
=> "external"}, "Gosse Bouma")."."), "\n";
  print h2("Search pattern"), "\n";
  print p("The search pattern consists of up to 5 terms, which represent the elements of an N-gram and must be separated by blanks.",
          "Our database engine supports five different types of search terms:"), "\n"; 
  print ul(li("a", b("literal term"), "matches the specified word form",
              "(e.g.", code("literatuur"), "&rarr;", i("literatuur").")"),
           li("a", b("word set"), "matches any of the listed word forms",
              "(e.g.", code("[huis,huisje]"), "&rarr;", i("huis, huisje").")"),
           li(b("wildcard terms"), "use", b(code("%")), "to stand for an arbitrary substring",
              "(e.g.", code("onmidde%ijk"), "&rarr;",
i("onmiddellijk,onmiddelijk,...").")"),
           li("the asterisk", b(code("*")), "matches an", b("arbitrary word"),
              "(usually the item of interest)"),
           li("a question mark", b(code("?")), "indicates a", b("skipped token").", which will be ignored in the result set"),
          ), "\n";
  print p("Push the", b("Search"), "button to execute your query,", b("Help"), "to display this help page,",
          "or", b("Reset Form"), "to start over from scratch.",
          "The", b("CSV"), "button returns a CSV table suitable for import into a spreadsheet program or database.",
          "The", b("XML"), "button returns the search results in an XML format, allowing this interface to be used as a Web service."), "\n";
  print h2("Options"), "\n";
  print p("You can customise the display format of search results with the option menus below the search pattern:"), "\n";
  print ul(li("select", b("how many N-grams"), "will be displayed (up to 10,000)"),
           li("only show N-grams above a certain", b("frequency threshold"), "(default: 10)"),
           li(b("variable elements"), "in a query (those matching a wildcard term or word set) can be:",
              ul(li(i("listed normally"), "as separate n-grams"),
                 li(i("grouped"), "together, so there is one group for every different word form"),
                 li(i("collapsed"), "by summing over all matching word forms"),
                )),
           li("optionally,", b("constant elements"), "(those matching a literal term, or variable elements that have been collapsed) can be suppressed for more concise output"),
           ), "\n";
  print h2("Examples"), "\n";
  print p("The examples below include comments starting with", code("//").", which must not be entered in the search pattern field."), "\n";
  print pre(<<'STOP'), "\n";
interessante *             // what are people most interested in?

* viool                  // '*' at the start of a query is much slower

sprak ? * [man,vrouw]       // use '?' to skip determiner etc.

 [houd, houdt, houden] van ? * // what do people enjoy? 
                              //notice the space at the start
                              // (use "collapsed" display)

%name ? * geweld       // use with "grouped" display

van * tot *               // a classic of Googleology

anti-establishment ?  // a trick to obtain unigram frequencies
STOP
  print h1(""), "\n";
  print end_html, "\n";
}
## ---- common preparations for SEARCH, XML and CSV operations
else {
  htmlError($Mode, "please enter a search pattern")
    if $Query eq "";
  checkRunningJobs($Mode); # abort if too many request are already being processed
  ## ---- SEARCH operatione
  if ($Mode eq "search") {
    print h1("Results"), "\n" ;
    my $T0 = time;
    my @results = execute_query();
    my $dT = time - $T0;
    print p({-class => "backlink"}, sprintf "%d matches in %.2f seconds", @results+0, $dT), "\n";
    print start_table({-style => "margin: 1em 2em 0em 2em;"}), "\n";
    foreach my $line (@results) {
      my ($f, $s) = @$line;
      if ($s !~ /XXX_END_OF_SENTENCE_XXX/ && $s !~ /XXX_BEGIN_OF_SENTENCE_XXX/ ) {
        if ($f eq "GROUP") {
          print Tr(td({-colspan => 3}, hr)), "\n";
        } else {
          print Tr(td({-align => "right"}, $f),
                 td({-width => 10}, ""),
                 td({-align => "left"}, escapeHTML($s))), "\n";
        }
      }
    }
    print end_table, "\n";
    print h1(""), "\n";
    print end_html, "\n";
  }
  ## ---- XML Web service
  elsif ($Mode eq "xml") {
    my @results = execute_query();
    print header(-type => "application/xml");
    print '<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>', "\n";
    print "<items>\n";
    my $in_group = 0;
    foreach my $line (@results) {
      my ($f, $s) = @$line;
      if ($s !~ /XXX_END_OF_SENTENCE_XXX/ && $s !~ /XXX_BEGIN_OF_SENTENCE_XXX/ ) {
       if ($f eq "GROUP") {
        print "</group>\n"
          if $in_group;
        print "<group>\n";
        $in_group = 1;
       } else {
        print "<item>\n";
        print "\t<hits>$f</hits>\n";
        print "\t<word>",escapeHTML($s),"</word>\n";
        print "</item>\n";
       }
     }
    }
    print "</group>\n"
      if $in_group;
    print "</items>\n";
  }
  ## ---- CSV table
  elsif ($Mode eq "csv") {
    my @results = execute_query();
    print header(-type => "text/comma-separated-values", -attachment => "Web1T5_frequency_list.csv");
    print '"frequency","N-gram"', "\n";
    foreach my $line (@results) {
      my ($freq, $ngram) = @$line;
      if ($ngram !~ /XXX_END_OF_SENTENCE_XXX/ && $ngram !~ /XXX_BEGIN_OF_SENTENCE_XXX/ ) {
        if ($freq eq "GROUP") {
         print '"",""', "\n";
        }
        else {
          printf '%d,"%s"%s', $freq, $ngram, "\n";
        }
      }
    }
  }
}

exit 0;


## ---- SUB execute_query() ... run query against SQLite database (uses global variables)
sub execute_query {
  ## split query into terms and check whether suitable n-gram database is available
  my @Terms = split " ", $Query;
  my $N = @Terms;
  #print p("executing query") ;
  ## n-gram database files (filenames are hard-coded so far)
  htmlError($Mode, "can't find vocabulary database file '$VocabFile' (internal error)")
    unless -f $VocabFile;
  my $NgramFile = sprintf $NgramFilePattern, $N;
  #print p($NgramFile) ;
  htmlError($Mode, "no data available for $N-grams, sorry!",
             "(cannot find the database file '$NgramFile')")
    unless -f $NgramFile;

  ## open SQLite database files
  my $DBH = DBI->connect("dbi:SQLite:dbname=$NgramFile", "", "", { RaiseError => 1, AutoCommit => 1 });
  #print p("opened sql db part 1 ") ;
  $DBH->do("PRAGMA temp_store_directory = ".$DBH->quote($TempDir))
    if $TempDir;
  $DBH->do("PRAGMA synchronous = 0");
  #print p("opened sql db") ;
  my ($res) = $DBH->selectrow_array("SELECT value FROM meta WHERE key = 'n'");
  htmlError($Mode, "'$NgramFile' is not a proper $N-gram database (internal error)")
    unless $res and $res == $N;
  ($res) = $DBH->selectrow_array("PRAGMA page_size");
  html_Error("can't determine page size of $N-gram database file (internal error)")
    unless $res and $res >= 512 and $res <= 32768;
  my $CachePages = int($CacheSize / $res);
  $DBH->do("PRAGMA cache_size = $CachePages");
  my ($Normalize) = $DBH->selectrow_array("SELECT value FROM meta WHERE key = 'normalize'");
  htmlError($Mode, "format error in '$NgramFile' (internal error)")
    unless defined $Normalize;
  $DBH->do("ATTACH ".$DBH->quote($VocabFile)." AS vocabulary");
  ($res) = $DBH->selectrow_array("SELECT value FROM vocabulary.meta WHERE key = 'normalize'");
  htmlError($Mode, "normalization status of '$VocabFile' doesn't match '$NgramFile' (internal error)")
    unless $res == $Normalize;
  #print p("constructing sql") ;
  ## construct SQL expression for n-gram query
  my @QT = map { { -K => $_ } } 1 .. $N; # query terms contain all relevant information
  my %LocalID = ("-1" => "..");          # negative IDs for special locally defined strings
  foreach my $k (1 .. $N) {
    my $idx = $k - 1;             # array subscript for k-th term
    my $term = $Terms[$idx];
    if ($term =~ /^%+$/) {
      htmlError($Mode, "wildcard-only term '$term' is not allowed (use ? or * instead)");
    }
    if ($term eq "?") {
      $QT[$idx]{-type} = "skip";  # ? = ignore this position (local ID = -1 for placeholder "..")
      $QT[$idx]{-var} = "-1";
    }
    else {
      $QT[$idx]{-var} = "w$k";    # all other positions are included in the result
      if ($term eq "*") {
        $QT[$idx]{-type} = "collocate"; # "*" terms represent collocate positions we're interested in
      }
      else {
        $QT[$idx]{-type} = "lexical";   # other terms are constraints to be matched in the query
        my $where_clause = undef;
        my $op = undef;
        if ($term =~ /^\[(.+)\]$/) {
          my @words = grep { s/\s+//; not /^$/ } split /,/, $1; # list of literal word forms, e.g. [mouse,mice]
          @words = map { normalizeString($Mode, $_) } @words
            if $Normalize;
          htmlError($Mode, "wildcard '%' not allowed in word list $term")
            if grep {/\%/} @words;
          $op = "IN";
          $where_clause = "WHERE w IN (". join(", ", map {$DBH->quote($_)} @words) . ")";
        }
        else {
          $term = normalizeString($Mode, $term)
            if $Normalize;
          $op = ($term =~ /\%/) ? "LIKE" : "=";
          $where_clause = "WHERE w $op ".$DBH->quote($term);
        }
        $QT[$idx]{-sql} = "w$k IN (SELECT id FROM vocabulary $where_clause)";
        if ($Optimize) {
          my ($freq) = $DBH->selectrow_array("SELECT SUM(f) FROM vocabulary $where_clause");
          my $cost = $freq || 0;
          $cost /= 1000 # table data are ordered by w1 => assume random access is 1000 x as expensive
            if $k == 1;
          $QT[$idx]{-cost} = $cost; # shuffle constraints so that least frequent term come first
        }
        else {
          $QT[$idx]{-cost} = $k;  # ensures that constraints are kept in original order without --optimize
        }
        if (($op eq "LIKE" or $op eq "IN") and ($Wildcards eq "group")) {
          $QT[$idx]{-order} = "w$k";
        }
        if (($op eq "=") or ($Wildcards eq "collapse")) {
          $LocalID{-($k+1)} = $term; # first query term has local ID -2, etc.
          $QT[$idx]{-var} = "-".($k+1)." AS const$k"; # replace variable by constant (= local ID) in SQL query
        }
      }
    }
  }

  ## collect SQL constraints and re-order them if --optimize has been specified
  my @SQL_constraints =  map {$_->{-sql}} sort {$a->{-cost} <=> $b->{-cost}} grep {defined $_->{-sql}} @QT;
  htmlError($Mode, "you have to specify at least one lexical item in your query!")
    unless @SQL_constraints > 0;

  if ($Optimize) {
    my $have_index_term = 0; # explicitly mark where index should be used (otherwise SQLite might make poor choices withou ANALYZE)
    @SQL_constraints = map {
      if (/^w[1-5]/) {
        if ($have_index_term) {
          "+$_"; # explicitly disallow use of index on any but the first SQL constraint
        }
        else {
          $have_index_term = 1;
          $_;
        }
      }
      else {
        $_
      }
    } @SQL_constraints;
  }

  ## construct full SQL query
  my $columns = join(", ", map { $_->{-var} } @QT);
  my $constraints = join(" AND ", @SQL_constraints);
  our $SQL = "SELECT $columns, SUM(f) AS freq FROM ngrams WHERE $constraints";
  my $group_vars = join(", ", grep {/^w/} map { $_->{-var} } @QT);
  if ($group_vars) {
    $SQL .= " GROUP BY $group_vars";
  }
  else {
    my @const_vars = map { (/(const[0-9])$/) ? $1 : () } map { $_->{-var} } @QT;
    htmlError($Mode, "can't find constant field for dummy GROUP BY clause (internal error)")
      unless @const_vars > 0;
    $SQL .= " GROUP BY $const_vars[0]";
  }
  $SQL .= " HAVING freq >= $Threshold"
    if $Threshold;
  my @order_vars = grep { $_ } map { $_->{-order} } @QT;
  push @order_vars, "freq DESC";
  $SQL .= " ORDER BY ".join(", ", @order_vars)
    if @order_vars;
  $SQL .= " LIMIT $Limit"
    if $Limit;

  ## show SQL query in debug mode
  if ($Debug) {
    print h2("SQL Query"), "\n";
    print p({-style => "margin-left: 2em; margin-right:1em; font-size: 90%;"},
            code(escapeHTML($SQL)));
    print "\n\n";
  }

  ## execute SQL query (returns table of ID values)
  my $id_table = $DBH->selectall_arrayref($SQL);
  my $n_rows = @$id_table;

  ## translate IDs back to strings using vocabulary database (with local memoization)
  my $id2str_query = $DBH->prepare("SELECT w FROM vocabulary WHERE id = ?");
  my %id2str = ();              # local lookup hash for memoization
  my @group_ids = map { -1 } 1 .. $N; # keep track of current group by ID values (--group option)
  my @lines = ();                     # collect output lines to be returned (format: [$freq, $ngram] or ["GROUP", $ngram])
  foreach my $row (@$id_table) {
    my @id = @$row;
    my $f = pop @id;
    my $start_group = 0;
    if ($Wildcards eq "group") {
      ## --group option: check whether variable terms are different from previous item
      foreach my $i (0 .. $N-1) {
        if ($QT[$i]{-order}) {    # only query terms with this attribute are relevant for grouping
          if ($group_ids[$i] != $id[$i]) {
            $start_group++;
            $group_ids[$i] = $id[$i];
          }
        }
      }
    }
    ## translate IDs to strings, using vocabulary database and local IDs
    my @str = ();
    foreach my $id (@id) {
      if ($id < 0) {
        push @str, $LocalID{$id} || "???" # negative IDs are special locally defined strings for constant elements
          unless $HideFixed;
      }
      else {
        my $s = $id2str{$id};
        if (not defined $s) {
          $id2str_query->execute($id);
          ($s) = $id2str_query->fetchrow_array;
          $s = "__ERROR__"
            unless defined $s;
          htmlError($Mode, "multiple entries for vocabulary ID #$id (internal error)")
            if $id2str_query->fetchrow_arrayref;
          $id2str{$id} = $s;
        }
        push @str, $s;
      }
    }
    push @lines, ["GROUP", ""]
      if $start_group;
    push @lines, [$f, "@str"];
  }

  ## disconnect from database
  undef $id2str_query;
  $DBH->disconnect if $DBH;
  undef $DBH;

  ## return collected results to main program
  return @lines;
}