cpan

����XML::Liberal�� test �Ǥ����ޤ��äƤ����Ȥ�������ι���Ϥޤä���

�桼�����١�����: ���줿 Premiere (�����Խ����ե�) �Υץ��������ȥե����뤬 XML::Liberal (Perl�⥸�塼��) ��ľ�ä�
�Ȥ������Ȥϡ��ʤ�Ȥ����Ʋ��줿�ץ��������ȥե�������well formed�פ� XML �ˤ���Ф�������ʤ��Τ����Ȼפä��櫓�������dz��������Τ���XML::Liberal�פȤ��� miyagawa �ץ������Ȥ� Perl �⥸�塼�롣

��ꤿ���ä��Τϡ�XHTML�Ǥʤ�HTML��XML::*�ʥ⥸�塼��ǰ������ȡ��㤨��XML::LibXML��Ȥ��С�JavaScript�ߤ����ʴ��Ф�DOM�����Ǥ��뤷��XML::Simple��my $title = XMLin($xhtml)->{head}{title}�ߤ����ʤ��Ȥ��ñ�˽���롣��������XML::Liberal������С�XML::*�ʥ⥸�塼���Xȴ����HTML�򿩤äƤ���ʤ�....

�ʤ顢XHTML�ˤ��Ƥ��ޤ��Ф����ǤϤʤ�����

�����ʤ��Ȥˡ�HTML::Tidy�Ȥ����⥸�塼�뤬���Ǥˤ��롣�����HTML-XHTML�Ѵ�������Ф�������ʤ�����

�ǡ����ä�����褿�Τ�������ʤΡ�

/lang/perl/XML-FromHTML/trunk/lib/XML/FromHTML.pm ? CodeRepos::Share ? Trac
package XML::FromHTML;
use strict;
use warnings;

# The digit runs of the revision keyword ("0" and "2") are formatted as "0.02".
our $VERSION = sprintf "%d.%02d", q$Revision: 0.2 $ =~ /(\d+)/g;

# Exporter provides the html2xml() export; HTML::Tidy provides the methods
# (such as clean) that instances of this class respond to.
use base qw(Exporter HTML::Tidy);
our @EXPORT = qw/html2xml/;

# Constructor.
#
# Accepts a flat list of key/value options, which is handed to HTML::Tidy->new
# as a hash reference.  The fixed settings are listed after the caller's
# options, so on a key collision the fixed setting wins.  The resulting
# HTML::Tidy object is re-blessed into the invoking class.
sub new {
    my ($class, @user_options) = @_;

    my $tidy_config = {
        @user_options,
        doctype          => 'omit',    # important for speed!
        indent           => 0,
        numeric_entities => 1,
        output_xhtml     => 1,
        tidy_mark        => 0,
        wrap             => 0,
    };

    my $tidy = HTML::Tidy->new($tidy_config);
    return bless $tidy, $class;
}

# Exported convenience function: build a fresh XML::FromHTML object and return
# whatever its clean() method produces for the given HTML string.
sub html2xml {
    my $converter = __PACKAGE__->new;
    my $html      = shift;
    return $converter->clean($html);
}

1;
% perl -MXML::Simple -MLWP::Simple -MData::Dumper \
    -le 'print Dumper(XMLin(get(shift)))' http://example.com/
Entity: line 12: parser error : 
  Opening and ending tag mismatch: body line 5 and BODY
% perl -Ilib -MXML::FromHTML -MXML::Simple -MLWP::Simple -MData::Dumper \
    -le 'print Dumper(XMLin(html2xml get(shift)))' http://example.com/
$VAR1 = {
          'body' => {
                    'p' => [
                           'You have reached this web page by typing "example.com", "example.net", or "example.org" into your web browser.',
                           {
                             'a' => {
                                    'href' => 'http://www.rfc-editor.org/rfc/rfc2606.txt',
                                    'content' => 'RFC 2606'
                                  },
                             'content' => [
                                            'These domain names are reserved for use in documentation and are not available for registration. See ',
                                            ', Section 3.'
                                          ]
                           }
                         ]
                  },
          'xmlns' => 'http://www.w3.org/1999/xhtml',
          'head' => {
                    'title' => 'Example Web Page'
                  }
        };

�����ä������ɤ��ꡣ

����ˤϤ���ˤ��ޤ������롣HTML��ľ���������®�ʤΤ���HTML::DOM�Ȥ����⥸�塼�뤬����Τ����������Ȥ���ꡢXHTML�ˤ��Ƥ���XML::LibXML��Ȥä�����®���Τ���

�ʲ���<title>��������Ȥ�ȴ���Ф��٥���ޡ����η�̡���������signature�θ���ˡ��������ˤ��ξ�� regexp �Ǥ�ä��̤��Τ������ᤤ����XML::LibXML������Ū��DOM���Ǥ�����ͥ��Ƥ��뤫��Ǥ��롣

http://example.com/
               Rate   HTML::DOM XML::LibXML      Regexp
HTML::DOM     287/s          --        -88%       -100%
XML::LibXML  2462/s        758%          --        -96%
Regexp      69585/s      24158%       2726%          --
http://blog.livedoor.jp/dankogai/
              Rate   HTML::DOM XML::LibXML      Regexp
HTML::DOM   1.07/s          --        -90%       -100%
XML::LibXML 11.2/s        946%          --       -100%
Regexp      2276/s     212139%      20189%          --

�褯�ͤ��Ƥߤ�С�Liberal��XML�Ȥ��ư�������ɮƬ��HTML�ǤϤʤ���������ʴ�ñ�ˤǤ���Ȥϡ�

Dan the (HTML|XML|Perl) Monger

use strict;
use warnings;
use Benchmark qw/cmpthese timethese/;
# use Data::Dumper;
use HTML::DOM;
use HTML::Entities;
use HTML::Tidy;
use HTTP::Response::Encoding;
use LWP::UserAgent;
use XML::FromHTML;
use XML::LibXML;

# The page to benchmark against is given as the first command-line argument.
# Fail with a usage hint instead of a bare "Died" when it is missing.
my $uri = shift or die "usage: $0 <url>\n";
my $res = LWP::UserAgent->new->get($uri);
die $res->status_line unless $res->is_success;

# decoded_content returns undef when the body cannot be decoded; stop here
# rather than feeding undef to every extractor below.
my $content = $res->decoded_content;
defined $content or die "could not decode response body from $uri\n";

# Parser objects created once and reused by title_xml and title_html.
my $xml = XML::LibXML->new;
my $dom = HTML::DOM->new();

# Extract the text following the first "<title>" tag with a regular expression.
#
# Takes the HTML source as a string and returns the captured title text with
# HTML entities decoded.  Returns nothing (undef in scalar context) when the
# input is undefined or the pattern does not match, instead of reading the
# capture variable after a failed match.
sub title_regexp {
    my $str = shift;
    return unless defined $str;

    # Case-insensitive match; the capture is everything up to the next "<".
    return unless $str =~ m{<title>((?>[^<]+))}msi;

    my $raw_title = $1;    # copy the capture before anything else can match
    return decode_entities($raw_title);
}

# Extract the title by converting the HTML with html2xml and parsing the
# result with the shared XML::LibXML parser in $xml.
#
# Takes the HTML source as a string and returns the serialized first child of
# the first <title> element.
sub title_xml {
    my $html     = shift;
    my $xhtml    = html2xml($html);
    my $document = $xml->parse_string($xhtml);

    # Scalar context: a node list object, from which the first node is taken.
    my $titles      = $document->getElementsByTagName('title');
    my $first_title = $titles->shift;

    return $first_title->firstChild->toString;
}
# Reference title that every benchmarked extractor must reproduce.  Without
# one the comparison below is meaningless, so stop with a clear message.
my $title = title_regexp($content);
defined $title or die "no <title> element found in $uri\n";
warn qq("$title");

# Extract the title by loading the HTML into the shared HTML::DOM object in
# $dom.
#
# Takes the HTML source as a string, writes it into the document between
# open() and close(), and returns the innerHTML of the first <title> element.
sub title_html {
    my $html = shift;

    $dom->open();
    $dom->write($html);
    $dom->close();

    my $titles = $dom->getElementsByTagName('title');
    return $titles->[0]->innerHTML;
}

# Benchmark the three extractors on the same content.  Each closure dies
# unless its extractor returns exactly the reference title.  A count of 0 is
# passed to timethese, and its results are handed to cmpthese for the
# comparison table.
my %extractor_for = (
    Regexp        => sub { die unless $title eq title_regexp($content) },
    'HTML::DOM'   => sub { die unless $title eq title_html($content) },
    'XML::LibXML' => sub { die unless $title eq title_xml($content) },
);

my $timings = timethese(0, \%extractor_for);
cmpthese($timings);