From de4031dd70a80e0b645d6e456510e309bacd4c92 Mon Sep 17 00:00:00 2001 From: Roland Hieber Date: Sat, 11 Mar 2017 05:45:44 +0100 Subject: [PATCH 1/2] refactor HTTP parsing We don't need the headers at all afterwards, but when parsing XML, they are in the way. The "found no matches" error is no longer needed in the future. --- Org.pm | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/Org.pm b/Org.pm index 8f699a7..f5227b6 100644 --- a/Org.pm +++ b/Org.pm @@ -197,32 +197,36 @@ Accept-Language: en_US, en\r\n); # # parse dict.leo.org output # - my @line = <$conn>; - close $conn or die "Connection failed: $!\n"; - $this->debug( "connection: done"); - - $site = join "", @line; - - if ($site !~ /HTTP\/1\.(0|1) 200 OK/i) { - if ($site =~ /HTTP\/1\.(0|1) (\d+) /i) { - # got HTTP error - my $err = $2; - if ($err == 407) { - croak "proxy auth required or access denied!\n"; - } - else { - if ($site =~ /Leider konnten wir zu Ihrem Suchbegriff/ || - $site =~ /found no matches for your search/ - ) { + $site = ""; + my $got_headers = 0; + while (<$conn>) { + if ($got_headers) { + $site .= $_; + } + elsif (/^\r?$/) { + $got_headers = 1; + } + elsif ($_ !~ /HTTP\/1\.(0|1) 200 OK/i) { + if (/HTTP\/1\.(0|1) (\d+) /i) { + # got HTTP error + my $err = $2; + if ($err == 407) { + croak "proxy auth required or access denied!\n"; + close $conn; return (); } else { croak "got HTTP error $err!\n"; + close $conn; + return (); } } } } + close $conn or die "Connection failed: $!\n"; + $this->debug( "connection: done"); + my @request = ( { id => 2, -- 2.1.2 From 89ac6cbcd6e5b92f5c845b9f91439cab06a16e2e Mon Sep 17 00:00:00 2001 From: Roland Hieber Date: Sat, 11 Mar 2017 06:10:24 +0100 Subject: [PATCH 2/2] implement new XML API, using XML::Simple Unfortunately, we cannot yet parse the additional hints (plural forms, cases, parts of speech, etc., everything inside tags) because the XML::Simple API does not retain the correct ordering of CDATA content mixed with subtags... :-/ This also removes the options -SpellTolerance, -Morphology, and -CharTolerance, which are no longer supported by the new API (as long as I can see.) Note that although the XML contains UTF-8 data, XML::Simple decodes it to latin1, so we have to re-encode it to get good results. --- Makefile.PL | 4 +- Org.pm | 163 ++++++++++++++++++++---------------------------------------- 2 files changed, 57 insertions(+), 110 deletions(-) diff --git a/Makefile.PL b/Makefile.PL index 3d5d4aa..4d621f9 100644 --- a/Makefile.PL +++ b/Makefile.PL @@ -12,7 +12,7 @@ WriteMakefile( 'PREREQ_PM' => { 'Carp::Heavy' => 0, 'IO::Socket' => 0, 'MIME::Base64' => 0, - 'HTML::TableParser' => 0 - } + 'XML::Simple' => 0 + } ); diff --git a/Org.pm b/Org.pm index f5227b6..7f591c5 100644 --- a/Org.pm +++ b/Org.pm @@ -16,7 +16,8 @@ use Carp::Heavy; use Carp; use IO::Socket; use MIME::Base64; -use HTML::TableParser; +use XML::Simple; +use Encode; sub debug; @@ -25,7 +26,7 @@ sub new { my $type = ref( $class ) || $class; my %settings = ( - "-Host" => "pda.leo.org", + "-Host" => "dict.leo.org", "-Port" => 80, "-UserAgent" => "Mozilla/5.0 (Windows NT 6.3; rv:36.0) Gecko/20100101 Firefox/36.0", "-Proxy" => "", @@ -94,23 +95,8 @@ sub translate { } } - # - # cut invalid values for parameters or set defaults if unspecified - # - my %form = ( - spellToler => { mask => [ qw(standard on off) ], val => $this->{"-SpellTolerance"} || "standard" }, - deStem => { mask => [ qw(standard none forcedAll) ], val => $this->{"-Morphology"} || "standard" }, - cmpType => { mask => [ qw(fuzzy exact relaxed) ], val => $this->{"-CharTolerance"} || "relaxed" }, - searchLoc => { mask => [ qw(-1 0 1) ], val => $this->{"-Language"} || "0" }, - ); - my @form; - foreach my $var (keys %form) { - if (grep { $form{$var}->{val} eq $_ } @{$form{$var}->{mask}}) { - push @form, $var . "=" . $form{$var}->{val}; - } - } - # add language + my @form; push @form, "lp=$lang{speak}"; # @@ -151,7 +137,7 @@ sub translate { } my($host, $pport) = split /:/, $proxy; if ($pport) { - $url = "http://$ip:$port/dictQuery/m-vocab/$lang{speak}/de.html"; + $url = "http://$ip:$port/dictQuery/m-vocab/$lang{speak}/query.xml"; $port = $pport; } else { @@ -162,7 +148,7 @@ sub translate { } else { $this->debug( "connecting to site:", $ip, "port", $port); - $url = "/dictQuery/m-vocab/$lang{speak}/de.html"; + $url = "/dictQuery/m-vocab/$lang{speak}/query.xml"; } my $conn = new IO::Socket::INET( @@ -227,79 +213,64 @@ Accept-Language: en_US, en\r\n); close $conn or die "Connection failed: $!\n"; $this->debug( "connection: done"); - my @request = ( - { - id => 2, - row => sub { $this->row(@_); }, - hdr => sub { $this->hdr(@_); } - }, - { - id => 3, - hdr => sub { $this->hdr(@_); }, - row => sub { $this->row(@_); } - }, - { - id => 4, - hdr => sub { $this->hdr(@_); }, - row => sub { $this->row(@_); } - } - ); $this->{Linecount} = 0; - my $p = HTML::TableParser->new( \@request, - { Decode => 1, Trim => 1, Chomp => 1, DecodeNBSP => 1 } ); - $site=~s/ /\ \;/g; - $p->parse($site); - - # put the rest on the stack, if any - if (@{$this->{section}}) { - $this->{data}->{ $this->{title} } = $this->{section}; - push @{$this->{segments}}, $this->{title}; + $this->{Maxsize} = 0; + + # parse the XML + my $xml = new XML::Simple; + my $data = $xml->XMLin($site, + ForceArray => [ 'section', 'entry' ], + ForceContent => 1, + KeyAttr => { side => 'lang' } + ); + + my (@matches, $from_lang, $to_lang); + $from_lang = substr $lang{speak}, 0, 2; + $to_lang = substr $lang{speak}, 2, 2; + + # parse all the s and build a string + sub parse_word($) { + my $word = shift; + if (ref $word eq "HASH") { + if ($word->{content}) { + return encode('UTF-8', $word->{content}); + } + elsif ($word->{cc}) { + # chinese simplified, traditional and pinyin + return encode('UTF-8', $word->{cc}->{cs}->{content} . "[" . + $word->{cc}->{ct}->{content} . "] " . + $word->{cc}->{pa}->{content}); + } + } + elsif (ref $word eq "ARRAY") { + return encode('UTF-8', @{$word}[-1]->{content}); + } + else { + return encode('UTF-8', $word); + } } - # put back in order - my @matches; - foreach my $title (@{$this->{segments}}) { - push @matches, { title => $title, data => $this->{data}->{$title} }; - } - return @matches; -} + foreach my $section (@{$data->{sectionlist}->{section}}) { + my @entries; + foreach my $entry (@{$section->{entry}}) { + my $left = parse_word $entry->{side}->{$from_lang}->{words}->{word}; + my $right = parse_word $entry->{side}->{$to_lang}->{words}->{word}; -sub hdr { - # HTML::TableParser header callback - my ( $this, $tbl_id, $line_no, $data, $udata ) = @_; - if ($data->[1] && $data->[0] eq $data->[1]) { - $this->debug("Probable start of a new section: $data->[1]"); - if (@{$this->{section}}) { - $this->{data}->{ $this->{title} } = $this->{section}; - push @{$this->{segments}}, $this->{title}; + push @entries, { left => $left, right => $right }; + if ($this->{Maxsize} < length($left)) { + $this->{Maxsize} = length($left); + } + $this->{Linecount}++; } - - $this->{title} = $data->[1]; - $this->{section} = []; + push @matches, { + title => encode('UTF-8', $section->{sctTitle}), + data => \@entries + }; } -} -sub row { - # HTML::TableParser data row callback - # - # divide rows into titles and lang data. - # we get 2 items (left and right column), if they - # are equal, it's a segment title, otherwise it's - # segment content. left columns ending in HH:MM - # are forumposts and ignored as well as rows with - # empty left cells. - my ( $this, $tbl_id, $line_no, $data, $udata ) = @_; - my $len = length($data->[0]); - if ($data->[1] && $data->[0] && $data->[0] ne $data->[1] && $data->[0] !~ /\d{2}:\d{2}$/) { - if ($len > $this->{Maxsize}) { - $this->{Maxsize} = $len; - } - $this->debug("line: $line_no, left: $data->[0], right: $data->[1]"); - push @{$this->{section}}, { left => $data->[0], right => $data->[1] }; - $this->{Linecount}++; - } + return @matches; } sub grapheme_length { @@ -403,30 +374,6 @@ Parameters to control behavior of dict.leo.org: =over -=item I<-SpellTolerance> - -Be tolerant to spelling errors. - -Default: turned on. - -Possible values: on, off. - -=item I<-Morphology> - -Provide morphology information. - -Default: standard. - -Possible values: standard, none, forcedAll. - -=item I<-CharTolerance> - -Allow umlaut alternatives. - -Default: relaxed. - -Possible values: fuzzy, exact, relaxed. - =item I<-Language> Translation direction. Please note that dict.leo.org always translates -- 2.1.2