Skip to content

Instantly share code, notes, and snippets.

@yseto
Last active December 11, 2015 01:18
Show Gist options
  • Save yseto/4522221 to your computer and use it in GitHub Desktop.
Save yseto/4522221 to your computer and use it in GitHub Desktop.

Revisions

  1. yseto revised this gist Mar 16, 2013. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion chugoku-np.pl
    Original file line number Diff line number Diff line change
    @@ -10,7 +10,7 @@
    use utf8;
    use Web::Scraper;
    use URI;
    use Encode qw/encode_utf8 decode/;
    use Encode qw/encode_utf8/;
    use DateTime;
    use DateTime::Format::Mail;
    use Text::Xslate;
  2. yseto revised this gist Mar 16, 2013. 1 changed file with 58 additions and 5 deletions.
    63 changes: 58 additions & 5 deletions chugoku-np.pl
    Original file line number Diff line number Diff line change
    @@ -1,21 +1,33 @@
    #!/usr/bin/env perl

    #
    # Copyright 2013 yseto
    # Released under the MIT License - Please reuse change and share
    # Copyright 2013 yseto
    # Released under the MIT License - Please reuse change and share
    #

    use strict;
    use warnings;
    use utf8;
    use Web::Scraper;
    use URI;
    use Encode qw/encode_utf8/;
    use Encode qw/encode_utf8 decode/;
    use DateTime;
    use DateTime::Format::Mail;
    use Text::Xslate;
    use FindBin;
    use Digest::MD5 qw(md5_hex);
    use IO::All;

    # Index page that is scraped; article links are resolved against it.
    my $base = "http://www.chugoku-np.co.jp/News/";

    # Cache directory, kept next to this script.
    my $dir = "$FindBin::Bin/cache/";

    # Create the cache directory on first use. Stop right away with the
    # system error if that fails, instead of failing later on a cache write.
    unless ( -d $dir ) {
        mkdir $dir or die "Cannot create cache directory $dir: $!";
    }

    # Cache lifetime in seconds (3 days).
    my $expire = 86400 * 3;

    #Scraper (Main)
    my $entry = scraper {
    process 'a',
    'link' => '@href',
    @@ -40,15 +52,55 @@ sub pubdate {
    second => 0,
    time_zone => 'Asia/Tokyo',
    );
    return DateTime::Format::Mail->format_datetime($dt);
    return $dt;
    }
    DateTime->now();
    }

    # Return the body text of one article, for use as the RSS <description>.
    #
    # Argument: the article link as found on the index page (it is resolved
    # against $base before fetching).
    #
    # Returns:
    #   - "" when the article date derived from the link is more than $expire
    #     seconds in the past (any cache file for it is removed);
    #   - the contents of the cache file when one exists;
    #   - otherwise the freshly scraped text, UTF-8 encoded, which is also
    #     written to the cache.
    #
    # pubdate() is called as pubdate($tn)->epoch, so it must return an object
    # that provides an epoch method.
    sub description {
        my $tn = shift;

        # Cache files are named after the MD5 hex digest of the link.
        my $path = $dir . md5_hex($tn);

        # Old entries: drop the cache file and provide no description.
        if ( ( time() - pubdate($tn)->epoch ) > $expire ) {
            unlink($path) if ( -f $path );
            return "";
        }

        # Cached entries are returned without touching the network.
        if ( -f $path ) {
            my $str < io( $path );
            return $str;
        }

        # Pause one second before each fetch (presumably to go easy on the
        # remote server).
        sleep 1;
        my $url = URI->new_abs( $tn, $base )->as_string;

        my $d_scraper = scraper {
            process '//div[@class="txt"]', 'content' => 'TEXT';
            result 'content';
        };

        # The scraper yields nothing when the page has no matching element;
        # treat that as an empty description rather than passing an undefined
        # value on to the cache and the template.
        my $text = $d_scraper->scrape( URI->new($url) );
        $text = "" unless defined $text;
        my $content = encode_utf8($text);

        # Store the encoded text for later runs.
        $content > io( $path );

        return $content;
    }

    my $tx = Text::Xslate->new(
    function => {
    p => \&pubdate,
    p => sub {
    return DateTime::Format::Mail->format_datetime( &pubdate(shift) );
    },
    t => sub { return encode_utf8(shift); },
    g => sub { return URI->new_abs( shift, $base )->as_string; },
    d => \&description,
    },
    );

    @@ -63,6 +115,7 @@ sub pubdate {
    <title><: t($data.title) :></title>
    <pubDate><: p($data.link) :></pubDate>
    <guid><: g($data.link) :></guid>
    <description><: d($data.link) :></description>
    </item>
    : }
    </channel>
  3. yseto revised this gist Jan 13, 2013. 1 changed file with 0 additions and 1 deletion.
    1 change: 0 additions & 1 deletion chugoku-np.pl
    Original file line number Diff line number Diff line change
    @@ -10,7 +10,6 @@
    use utf8;
    use Web::Scraper;
    use URI;
    use URI::Fetch;
    use Encode qw/encode_utf8/;
    use DateTime;
    use DateTime::Format::Mail;
  4. yseto revised this gist Jan 13, 2013. 1 changed file with 6 additions and 0 deletions.
    6 changes: 6 additions & 0 deletions chugoku-np.pl
    Original file line number Diff line number Diff line change
    @@ -1,4 +1,10 @@
    #!/usr/bin/env perl

    #
    # Copyright 2013 yseto
    # Released under the MIT License - Please reuse change and share
    #

    use strict;
    use warnings;
    use utf8;
  5. yseto created this gist Jan 13, 2013.
    77 changes: 77 additions & 0 deletions chugoku-np.pl
    Original file line number Diff line number Diff line change
    @@ -0,0 +1,77 @@
    #!/usr/bin/env perl
    use strict;
    use warnings;
    use utf8;
    use Web::Scraper;
    use URI;
    use URI::Fetch;
    use Encode qw/encode_utf8/;
    use DateTime;
    use DateTime::Format::Mail;
    use Text::Xslate;
    # Index page of the news site; also the base for resolving article links.
    my $base = "http://www.chugoku-np.co.jp/News/";

    # Scraper for a single list item: takes the href and the text of the
    # anchor inside it and reports them as 'link' and 'title'.
    my $entry = scraper {
        process 'a', 'link' => '@href', 'title' => 'text';
        result 'title', 'link';
    };

    # Scraper for the index page: applies $entry to every <li> of the news
    # list and collects the results under 'items'.
    my $scraper = scraper {
        process '//div[@class="newsList2 clearfix ofh"]/ul/li',
            'items[]' => $entry;
        result 'items';
    };

    # Fetch the index page and extract the list of entries.
    my $index_uri = URI->new($base);
    my $result    = $scraper->scrape($index_uri);

    # Build the publication date string for an article from its link.
    #
    # Argument: an article link. Links containing
    # "Tn<YYYY><MM><DD><4 digits>.html" carry the date in the first eight
    # digits; the trailing four digits are matched but not used.
    #
    # Returns the date formatted by DateTime::Format::Mail, taken as midnight
    # in the Asia/Tokyo time zone. A link that does not carry a date falls
    # back to the current time so that <pubDate> is never left empty.
    sub pubdate {
        my $link = shift;

        my $dt;
        if ( defined $link
            && $link =~ /Tn(\d{4})(\d{2})(\d{2})\d{4}\.html/ )
        {
            $dt = DateTime->new(
                year      => $1,
                month     => $2,
                day       => $3,
                hour      => 0,
                minute    => 0,
                second    => 0,
                time_zone => 'Asia/Tokyo',
            );
        }
        else {
            # No date in the link: use "now" instead of returning nothing.
            $dt = DateTime->now( time_zone => 'Asia/Tokyo' );
        }

        return DateTime::Format::Mail->format_datetime($dt);
    }

    # Helper functions made available to the template:
    #   p(link) - publication date string derived from the link (pubdate)
    #   t(text) - the text encoded as UTF-8
    #   g(link) - the link as an absolute URL, resolved against $base
    my %helper_for = (
        p => \&pubdate,
        t => sub {
            my ($text) = @_;
            return encode_utf8($text);
        },
        g => sub {
            my ($link) = @_;
            return URI->new_abs( $link, $base )->as_string;
        },
    );

    my $tx = Text::Xslate->new( function => \%helper_for );

    # RSS 2.0 feed template in Text::Xslate (Kolon) syntax.
    # Template variables: $now (feed publication date string) and $result
    # (list of entries with 'title' and 'link'). The helpers t, p and g are
    # the functions registered on the Text::Xslate object.
    # Variables are only expanded inside <: ... :> tags, so the channel
    # pubDate must use a tag; a bare $now would be emitted literally.
    my $template = q{<?xml version="1.0" encoding="UTF-8"?>
    <rss version="2.0">
    <channel>
    <title>中国新聞 自家RSS</title>
    <language>ja</language>
    <pubDate><: $now :></pubDate>
    : for $result -> $data {
    <item>
    <title><: t($data.title) :></title>
    <pubDate><: p($data.link) :></pubDate>
    <guid><: g($data.link) :></guid>
    </item>
    : }
    </channel>
    </rss>
    };

    # Render the feed with the scraped entries and the current time in the
    # Asia/Tokyo zone, then write it to standard output as UTF-8.
    {
        my $now = DateTime->now();
        $now->set_time_zone('Asia/Tokyo');

        my %vars = (
            result => $result,
            now    => DateTime::Format::Mail->format_datetime($now),
        );

        my $feed = $tx->render_string( $template, \%vars );
        print encode_utf8($feed);
    }