Last active
December 11, 2015 01:18
-
-
Save yseto/4522221 to your computer and use it in GitHub Desktop.
Revisions
-
yseto revised this gist
Mar 16, 2013. 1 changed file with 1 addition and 1 deletion. There are no files selected for viewing.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -10,7 +10,7 @@ use utf8; use Web::Scraper; use URI; use Encode qw/encode_utf8/; use DateTime; use DateTime::Format::Mail; use Text::Xslate; -
yseto revised this gist
Mar 16, 2013. 1 changed file with 58 additions and 5 deletions. There are no files selected for viewing.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,21 +1,33 @@ #!/usr/bin/env perl # # Copyright 2013 yseto # Released under the MIT License - Please reuse change and share # use strict; use warnings; use utf8; use Web::Scraper; use URI; use Encode qw/encode_utf8 decode/; use DateTime; use DateTime::Format::Mail; use Text::Xslate; use FindBin; use Digest::MD5 qw(md5_hex); use IO::All; my $base = "http://www.chugoku-np.co.jp/News/"; #Cache Directory my $dir = "$FindBin::Bin/cache/"; mkdir $dir unless ( -d $dir ); #Cache Expire (sec) my $expire = 86400 * 3; #Scraper (Main) my $entry = scraper { process 'a', 'link' => '@href', @@ -40,15 +52,55 @@ sub pubdate { second => 0, time_zone => 'Asia/Tokyo', ); return $dt; } DateTime->now(); } sub description { my $tn = shift; #hash generate. my $name = md5_hex($tn); #old entries cache delete. if ( ( time() - &pubdate($tn)->epoch ) > $expire ) { unlink( $dir . $name ) if ( -f $dir . $name ); return ""; } #if exist cache. if ( -f $dir . $name ) { my $str < io( $dir . $name ); return $str; } #oh... sleep 1; my $url = URI->new_abs( $tn, $base )->as_string; my $d_scraper = scraper { process '//div[@class="txt"]', 'content' => 'TEXT'; result 'content'; }; my $content = encode_utf8( $d_scraper->scrape( URI->new($url) ) ); #cache generate. $content > io( $dir . $name ); #return description. 
$content; } my $tx = Text::Xslate->new( function => { p => sub { return DateTime::Format::Mail->format_datetime( &pubdate(shift) ); }, t => sub { return encode_utf8(shift); }, g => sub { return URI->new_abs( shift, $base )->as_string; }, d => \&description, }, ); @@ -63,6 +115,7 @@ sub pubdate { <title><: t($data.title) :></title> <pubDate><: p($data.link) :></pubDate> <guid><: g($data.link) :></guid> <description><: d($data.link) :></description> </item> : } </channel> -
yseto revised this gist
Jan 13, 2013. 1 changed file with 0 additions and 1 deletion. There are no files selected for viewing.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -10,7 +10,6 @@ use utf8; use Web::Scraper; use URI; use Encode qw/encode_utf8/; use DateTime; use DateTime::Format::Mail; -
yseto revised this gist
Jan 13, 2013. 1 changed file with 6 additions and 0 deletions. There are no files selected for viewing.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -1,4 +1,10 @@ #!/usr/bin/env perl # # Copyright 2013 yseto # Released under the MIT License - Please reuse change and share # use strict; use warnings; use utf8; -
yseto created this gist
Jan 13, 2013. There are no files selected for viewing.
#!/usr/bin/env perl
#
# Scrape the news index page at $base and print the headlines found there
# to STDOUT as an RSS 2.0 document encoded as UTF-8.
#
use strict;
use warnings;
use utf8;

use Web::Scraper;
use URI;
use URI::Fetch;
use Encode qw/encode_utf8/;
use DateTime;
use DateTime::Format::Mail;
use Text::Xslate;

# Index page to scrape; relative article links are resolved against it.
my $base = "http://www.chugoku-np.co.jp/News/";

# Time zone applied to every date written into the feed.
my $time_zone = 'Asia/Tokyo';

# Per-item scraper: from each list item take the anchor's href ("link")
# and its text ("title").
my $entry = scraper {
    process 'a',
      'link'  => '@href',
      'title' => 'text';
    result 'title', 'link';
};

# Page scraper: collect one $entry result per <li> of the news list.
my $scraper = scraper {
    process '//div[@class="newsList2 clearfix ofh"]/ul/li',
      'items[]' => $entry;
    result 'items';
};

# Fall back to an empty list so the template loop still renders a valid
# (empty) channel if the scraper yields no items.
my $result = $scraper->scrape( URI->new($base) ) || [];

# Return the RFC 2822 date string for an article link.
#
# A link containing "Tn<YYYY><MM><DD><NNNN>.html" carries its date in the
# file name; midnight of that day in $time_zone is used (the trailing four
# digits are not used). For any other link, or when the digits do not form
# a valid date, the current time is used instead so that <pubDate> is never
# left empty.
sub pubdate {
    my $link = shift;

    my $dt;
    if ( defined $link
        && $link =~ /Tn(\d{4})(\d{2})(\d{2})(\d{4})\.html/ )
    {
        my ( $year, $month, $day ) = ( $1, $2, $3 );

        # DateTime->new dies on out-of-range values (e.g. month 13); one
        # malformed link must not abort the whole feed.
        $dt = eval {
            DateTime->new(
                year      => $year,
                month     => $month,
                day       => $day,
                hour      => 0,
                minute    => 0,
                second    => 0,
                time_zone => $time_zone,
            );
        };
    }

    $dt = DateTime->now( time_zone => $time_zone ) unless defined $dt;

    return DateTime::Format::Mail->format_datetime($dt);
}

# Template engine with the helper functions used by the template below:
#   p(link)  -> publication date string (see pubdate)
#   t(text)  -> text encoded as UTF-8
#   g(link)  -> absolute URL of the link, resolved against $base
#
# NOTE(review): t() encodes to UTF-8 and the rendered document is encoded
# again before printing. If the scraped titles are already decoded
# character strings this would double-encode them -- confirm against real
# output before changing.
my $tx = Text::Xslate->new(
    function => {
        p => \&pubdate,
        t => sub { return encode_utf8(shift); },
        g => sub { return URI->new_abs( shift, $base )->as_string; },
    },
);

# q{} does not interpolate, so every dynamic value must go through an
# Xslate "<: ... :>" tag; a bare "$now" would be printed literally.
my $template = q{<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>中国新聞 自家RSS</title>
<language>ja</language>
<pubDate><: $now :></pubDate>
: for $result -> $data {
<item>
<title><: t($data.title) :></title>
<pubDate><: p($data.link) :></pubDate>
<guid><: g($data.link) :></guid>
</item>
: }
</channel>
</rss>
};

my $now = DateTime::Format::Mail->format_datetime(
    DateTime->now( time_zone => $time_zone ) );

print encode_utf8(
    $tx->render_string(
        $template,
        {
            result => $result,
            now    => $now,
        }
    )
);