#!/usr/bin/perl -w -I/root/newsdb

use getMySQLUserPwd;
use RSSLite;
use DBI;
use LWP::UserAgent;
use HTTP::Cookies;
use HTTP::Request;
use Data::Dumper;
use Getopt::Std;

my $version = '0.05';

getopts('v:w:');

$opt_v = 0 unless $opt_v;
if ($opt_v !~ /\d+/) {
  die "Verbose setting must be a digit!\n";
}

my $cookies = HTTP::Cookies->new;

my $ua = LWP::UserAgent->new;
$ua->agent("IL-xml-harvester/$version");
$ua->from("scott\@industrial-linux.org");
$ua->timeout(45);
$ua->cookie_jar($cookies);

my $dbh;
$dbh = DBI->connect(
  "DBI:mysql:database=news",
  scalar(getpwuid($>)),
  getMySQLUserPwd()
) or die $dbh->errstr;

my $site_select = 'select id, site_url, item_url from site';
$site_select   .= " where $opt_w" if $opt_w;
my $sites = $dbh->selectall_arrayref ($site_select) or die $dbh->errstr;

my $insert = $dbh->prepare(
  "insert ignore into item " . 
  "  set id = ?, ts = now(), title = ?, descrip = ?, url = ?, site_id = ?"
) or die $dbh->errstr;

my $getmax = $dbh->prepare(
  "select max(id)+1 from item"
) or die $dbh->errstr;

my $site;
foreach $site (@$sites) {
  my $content;
  my ($site_id, $site_url, $item_url) = @$site;

  print "Retrieving content from $item_url...\n" if $opt_v >= 4;
  my $req = HTTP::Request->new('GET', $item_url);
  my $res = $ua->request($req);

  if ($res->is_success) {
    $content = $res->content;
    if (!$content) {
      print "Empty content from $item_url!\n" if $opt_v >= 2;
      next;
    }

  ## Maybe they left good info in anchor tags
  } elsif ($res->code eq '404') {  
    print "Attempting link extraction...\n" if $opt_v >= 3;
    my $throwaway = $res->content;
    my $hrefs = $throwaway =~ s/<a\s+href=//gsi;

    ## 8 is more than typical navbar, but allows for not-quite-full RSS channel
    if ($hrefs >= 8) { 
      print "Woohoo! Salvaged content by converting HTML to RSS for $item_url\n"
        if $opt_v >= 2;
      $content = LinksToRDF($item_url, $res->content);
    } else {
      print "GET failed with error ", $res->code, 
            " for $item_url,\n"
        if $opt_v >= 1;
      next;
    }

  ## Maybe they just need a 'www.' in front
  } elsif ($res->code eq '500') {  
    print "Attempting prefix www...\n" if $opt_v >= 3;
    $item_url =~ m|http://(.*?)/(.*)|;
    my ($host, $path) = ($1, $2);
    if ($host !~ /^www./) {
      $item_url = "http://www.$host/$path";
      $req = HTTP::Request->new('GET', $item_url);
      $res = $ua->request($req);
      if ($res->is_success and $res->content) {
        print "Woohoo! Salvaged content by adding 'www.' to hostname!\n"
          if $opt_v >= 2;
        $content = $res->content;
      } else {
        print "GET failed for $item_url,\n", $res->as_string, "\n" 
          if $opt_v >= 1;
        next;
      }
    }

  ## Maybe things are just screwed up
  } else {
    print "GET failed for $item_url,\n", $res->as_string, "\n" if $opt_v >= 1;
    next;
  }

  my $type = usableXML(\$content);
  print "  ...XML type $type recognized\n" if $opt_v >= 8;
  if (not $type) {
    print "NOT usable XML content: $item_url\n" if $opt_v >= 2;
    next;
  }
  
  my %channel = ();
  my @items = ();
    
  my %rsl = ();
  RSSLite::parseXML(\%rsl, \$content);

  $channel{'description'} = $rsl{'description'};
  $channel{'title'}       = $rsl{'title'};
  $channel{'link'}        = $rsl{'link'};
  $channel{'img_url'}     = undef;
  if (defined(@{$rsl{'items'}})) {
    @items = @{$rsl{'items'}};
  } else {
    @items = (); # Let it go so error msg can print
  }

  if (not scalar(@items)) {
    print "No items found for $item_url!\n" if $opt_v >= 2;
    next;
  }

  $channel{'link'} =~ s/&amp;/&/gi;

  $dbh->do(
    "update site set descrip = ?, title = ?, img_url = ?, site_url = ?" .
    "  where id = $site_id", undef, 
    notempty($channel{'description'}),
    notempty($channel{'title'}),     
    notempty($channel{'img_url'}),        
    notempty($channel{'link'})    
  ) or die $dbh->errstr;

  my $i;
  foreach $i (@items) {
    $getmax->execute or die $getmax->errstr;
    my ($item_id) = $getmax->fetchrow_array;
    if ($getmax->err) {
      die $getmax->errstr;
    }
    $item_id = 1 unless $item_id;

    print "  ...insert link ", substr($i->{'link'},0,60), "\n" if $opt_v >= 8;
    $insert->execute(
      $item_id, 
      notempty($i->{'title'}), 
      notempty($i->{'description'}), 
      notempty($i->{'link'}), 
      $site_id
    );
    if ($insert->err and $insert->errstr !~ /duplicate/i) {
      die $insert->errstr;
    }
  }

}

undef $insert;
undef $getmax;
$dbh->disconnect;

print "Done!" if $opt_v;

exit 0;

sub notempty {
  my ($s) = @_;
  if ((not defined($s)) or not $s) {
    return '-';
  } else {
    $s = trim($s);
    $s = '-' if not $s;
    return $s;
  }
}

sub trim {
  my $s = shift;

  $s =~ s/^\s*(.*?)\s*$/$1/;
  return $s;
}

sub LinksToRDF {
  my $item_url = shift;
  my $content = shift;

  my $newcontent = <<EOC;
<rdf:RDF>
  <channel>
    <title>$item_url</title>
    <link>$item_url</title>
    <description>Auto-built from HTML links</description>
  </channel>
EOC

  my $link;
  my @links = ($content =~ m|(<a\s+href=.*?</a>)|gsi);
  foreach $link (@links) {
    $link =~ m|<a\s+href="(.*?)".*?>(.*?)</a>|i;
    my ($href, $title) = ($1, $2);
    next if substr($title,0,1) eq '<';
    $newcontent .= <<EOC;
<item>
  <title>$title</title>
  <link>$href</link>
</item>
EOC
  }

  $newcontent .= "</rdf:RDF>";

  return $newcontent;
}