
����XML::Liberal�� test �Ǥ����ޤ��äƤ����Ȥ�������ι���Ϥޤä���

�桼�����١�����: ���줿 Premiere (�����Խ����ե�) �Υץ��������ȥե����뤬 XML::Liberal (Perl�⥸�塼��) ��ľ�ä�
�Ȥ������Ȥϡ��ʤ�Ȥ����Ʋ��줿�ץ��������ȥե�������well formed�פ� XML �ˤ���Ф�������ʤ��Τ����Ȼפä��櫓�������dz��������Τ���XML::Liberal�פȤ��� miyagawa �ץ������Ȥ� Perl �⥸�塼�롣

��ꤿ���ä��Τϡ�XHTML�Ǥʤ�HTML��XML::*�ʥ⥸�塼��ǰ������ȡ��㤨��XML::LibXML��Ȥ��С�JavaScript�ߤ����ʴ��Ф�DOM�����Ǥ��뤷��XML::Simple��my $title = XMLin($xhtml)->{head}{title}�ߤ����ʤ��Ȥ��ñ�˽���롣��������XML::Liberal������С�XML::*�ʥ⥸�塼���Xȴ����HTML�򿩤äƤ���ʤ�....




/lang/perl/XML-FromHTML/trunk/lib/XML/ ? CodeRepos::Share ? Trac
package XML::FromHTML;
use warnings;
use strict;
our $VERSION = sprintf "%d.%02d", q$Revision: 0.2 $ =~ /(\d+)/g;

# Inherit from Exporter (so html2xml can be exported) and from HTML::Tidy
# (whose constructor and clean() method this class builds on).
use base qw(Exporter HTML::Tidy);
our @EXPORT = qw/html2xml/;

# new()
#   Class method. Builds an HTML::Tidy object with a fixed configuration
#   and reblesses it into the invoking class.
#   HTML::Tidy->new expects its configuration as a single hash reference,
#   so the options are wrapped in { ... } rather than passed as a flat list.
sub new {
    my $class = shift;
    my $tidy  = HTML::Tidy->new(
        {
            doctype          => 'omit',    # important for speed!
            indent           => 0,
            numeric_entities => 1,
            output_xhtml     => 1,
            tidy_mark        => 0,
            wrap             => 0,
        }
    );
    return bless $tidy, $class;
}

# html2xml($html)
#   Exported convenience function: runs $html through clean() on a freshly
#   constructed XML::FromHTML object and returns whatever clean() returns.
sub html2xml {
    my $html = shift;
    return __PACKAGE__->new->clean($html);
}

1;    # a module file must end with a true value
% perl -MXML::Simple -MLWP::Simple -MData::Dumper \
    -le 'print Dumper(XMLin(get(shift)))' line 12: parser error : 
  Opening and ending tag mismatch: body line 5 and BODY
% perl -Ilib -MXML::FromHTML -MXML::Simple -MLWP::Simple -MData::Dumper \
    -le 'print Dumper(XMLin(html2xml get(shift)))'
$VAR1 = {
          'body' => {
                    'p' => [
                           'You have reached this web page by typing "", "", or "" into your web browser.',
                           {
                             'a' => {
                                    'href' => '',
                                    'content' => 'RFC 2606'
                                  },
                             'content' => [
                                            'These domain names are reserved for use in documentation and are not available for registration. See ',
                                            ', Section 3.'
                                          ]
                           }
                         ]
                  },
          'xmlns' => '',
          'head' => {
                    'title' => 'Example Web Page'
                  }
        };



�ʲ���<title>��������Ȥ�ȴ���Ф��٥���ޡ����η�̡���������signature�θ���ˡ��������ˤ��ξ�� regexp �Ǥ�ä��̤��Τ������ᤤ����XML::LibXML������Ū��DOM���Ǥ�����ͥ��Ƥ��뤫��Ǥ��롣
               Rate   HTML::DOM XML::LibXML      Regexp
HTML::DOM     287/s          --        -88%       -100%
XML::LibXML  2462/s        758%          --        -96%
Regexp      69585/s      24158%       2726%          --
              Rate   HTML::DOM XML::LibXML      Regexp
HTML::DOM   1.07/s          --        -90%       -100%
XML::LibXML 11.2/s        946%          --       -100%
Regexp      2276/s     212139%      20189%          --


Dan the (HTML|XML|Perl) Monger

use strict;
use warnings;
use Benchmark qw/cmpthese timethese/;
# use Data::Dumper;
use HTML::DOM;
use HTML::Entities;
use HTML::Tidy;
use HTTP::Response::Encoding;
use LWP::UserAgent;
use XML::FromHTML;
use XML::LibXML;

# The page to benchmark against is given as the first command-line argument.
my $uri = shift
    or die "usage: $0 <uri>\n";

# Fetch the page once; every benchmark candidate works on the same content.
my $res = LWP::UserAgent->new->get($uri);
die $res->status_line unless $res->is_success;

# decoded_content can return undef when the body cannot be decoded, so fail
# here with a clear message instead of benchmarking against undef.
my $content = $res->decoded_content;
defined $content
    or die "could not decode the response body of $uri\n";

# Parser objects are created once, outside the timed code.
my $xml = XML::LibXML->new;
my $dom = HTML::DOM->new();

# title_regexp($html)
#   Returns the text that follows the first "<title>" tag (matched
#   case-insensitively) up to, but not including, the next "<".
#   Returns undef when there is no "<title>" tag or it has no text.
sub title_regexp {
    my $str = shift;

    # Return the capture explicitly: the bare match would yield only a
    # true/false value in scalar context, and $1 must not be read after a
    # failed match because it would still hold an older capture.
    return $str =~ m{<title>((?>[^<]+))}msi ? $1 : undef;
}

# title_xml($html)
#   Converts $html to XML with html2xml(), parses it with the shared
#   XML::LibXML parser in $xml, and returns the text of the first <title>
#   element, or undef when the document has none.
#   NOTE(review): the tail of this sub (the title lookup and the closing
#   brace) was missing from this copy of the script; the lookup below is a
#   reconstruction -- confirm against the original.
sub title_xml {
    my $xhtml = html2xml(shift);
    my $doc   = $xml->parse_string($xhtml);

    # getElementsByTagName returns a list of nodes in list context; only
    # the first match is needed.
    my ($title_node) = $doc->getElementsByTagName('title');
    return defined $title_node ? $title_node->textContent : undef;
}
# Reference value: the title as found by the regexp extractor. The benchmark
# candidates compare their own result against it, so stop early when no
# title could be found at all.
my $title = title_regexp($content);
defined $title
    or die "no <title> found in $uri\n";
warn qq("$title");

# title_html($html)
#   Extracts the page title with HTML::DOM, reusing the shared $dom object.
#   NOTE(review): the body of this sub was missing from this copy of the
#   script; this is a reconstruction based on HTML::DOM's documented
#   open/write/close/title methods -- confirm against the original.
sub title_html {
    my $html = shift;
    $dom->open;            # start from an empty tree on every call
    $dom->write($html);
    $dom->close;           # signal end of input to the parser
    return $dom->title;
}

# Time each extractor (count 0 = run for Benchmark's default minimum CPU
# time) and print a comparison chart. A candidate dies as soon as its result
# differs from the reference $title.
# NOTE(review): the enclosing cmpthese/timethese call was missing from this
# copy of the script; both functions are imported from Benchmark above.
cmpthese(
    timethese(
        0,
        {
            'XML::LibXML' => sub { $title eq title_xml($content)    or die },
            'HTML::DOM'   => sub { $title eq title_html($content)   or die },
            Regexp        => sub { $title eq title_regexp($content) or die },
        }
    )
);