Last active
December 11, 2015 01:18
-
-
Save yseto/4522221 to your computer and use it in GitHub Desktop.
中国新聞のRSSを生成するスクリプト あんたぁ、中国新聞読みたいんじゃろう。じゃけどRSSがないけえ、生成するんよ。 記事を取得してRSSの中身に出すようにしました。
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env perl | |
# | |
# Copyright 2013 yseto | |
# Released under the MIT License - Please reuse change and share | |
# | |
use strict; | |
use warnings; | |
use utf8; | |
use Web::Scraper; | |
use URI; | |
use Encode qw/encode_utf8/; | |
use DateTime; | |
use DateTime::Format::Mail; | |
use Text::Xslate; | |
use FindBin; | |
use Digest::MD5 qw(md5_hex); | |
use IO::All; | |
# Index page that is scraped for headlines; it also serves as the base URL
# against which the scraped article links are resolved.
my $base = "http://www.chugoku-np.co.jp/News/";

# Cache directory, located next to this script.
my $dir = "$FindBin::Bin/cache/";

# Create the cache directory on first run. Report the OS error right away
# instead of failing later on the first cache write; the second -d check
# tolerates another process having created the directory in the meantime.
unless ( -d $dir ) {
    mkdir $dir
        or -d $dir
        or die "Cannot create cache directory $dir: $!";
}

# Cache lifetime in seconds (three days).
my $expire = 86400 * 3;
# Main scraper: each <li> of the news list becomes one element of "items",
# a hash holding the text ("title") and the href ("link") of the anchors
# found inside it.
my $scraper = scraper {
    process '//div[@class="newsList2 clearfix ofh"]/ul/li',
        'items[]' => scraper {
            process 'a',
                'link'  => '@href',
                'title' => 'text';
            result 'title', 'link';
        };
    result 'items';
};

# Fetch the index page and extract the headline list.
my $index_uri = URI->new($base);
my $result    = $scraper->scrape($index_uri);
# Derive an article's publication date from its link.
#
# The link is expected to contain "Tn" followed by YYYYMMDD, four further
# digits (ignored) and ".html". Only the date part is used, so the result is
# midnight of that day in the Asia/Tokyo time zone.
#
# Parameters:
#   $link - article href as scraped from the index page (may be undef).
# Returns:
#   A DateTime object. If the link is undefined, does not match the pattern,
#   or carries an impossible date, the current time is returned instead.
sub pubdate {
    my $link = shift;

    if ( defined $link
        && $link =~ /Tn(\d{4})(\d{2})(\d{2})\d{4}\.html/ )
    {
        my ( $year, $month, $day ) = ( $1, $2, $3 );

        # DateTime->new dies on out-of-range values (e.g. month 13); one odd
        # link must not abort the generation of the whole feed.
        local $@;
        my $dt = eval {
            DateTime->new(
                year      => $year,
                month     => $month,
                day       => $day,
                hour      => 0,
                minute    => 0,
                second    => 0,
                time_zone => 'Asia/Tokyo',
            );
        };
        return $dt if defined $dt;
    }

    return DateTime->now();
}
# Return the body text of one article, using an on-disk cache.
#
# Parameters:
#   $tn - article link as scraped from the index page (relative or absolute).
# Returns:
#   The article text as a character string. An empty string is returned when
#   the link is missing, the article is older than $expire seconds, the
#   article cannot be fetched, or no text block is found on the page.
#
# The cache file is named after the MD5 hex digest of the link and holds the
# text as UTF-8 bytes. The text is handed back decoded because the rendered
# feed is UTF-8 encoded as a whole when it is printed; returning bytes here
# would get them encoded a second time.
sub description {
    my $tn = shift;

    return "" unless defined $tn && length $tn;

    my $cache_file = $dir . md5_hex($tn);

    # Expired entry: drop its cache file and emit no description.
    if ( ( time() - pubdate($tn)->epoch ) > $expire ) {
        unlink $cache_file if -f $cache_file;
        return "";
    }

    # Cache hit: the file holds UTF-8 bytes.
    if ( -f $cache_file ) {
        my $bytes < io($cache_file);
        return Encode::decode_utf8($bytes);
    }

    # Cache miss: pause before every request so the site is not hammered.
    sleep 1;

    my $url       = URI->new_abs( $tn, $base )->as_string;
    my $d_scraper = scraper {
        process '//div[@class="txt"]', 'content' => 'TEXT';
        result 'content';
    };

    # A failed request for one article must not abort the whole feed; skip
    # the description and leave the cache untouched so a later run retries.
    my $text;
    local $@;
    my $fetched = eval {
        $text = $d_scraper->scrape( URI->new($url) );
        1;
    };
    unless ($fetched) {
        warn "description: could not fetch $url: $@";
        return "";
    }

    # Page fetched but no text block found: cache the empty result as well so
    # the page is not requested again on every run.
    $text = "" unless defined $text;

    my $encoded = encode_utf8($text);
    $encoded > io($cache_file);

    return $text;
}
# Template engine with the helper functions used by the feed template:
#   p(link)  - publication date of the article in RFC 2822 (mail) format
#   t(title) - headline text
#   g(link)  - absolute article URL, resolved against $base
#   d(link)  - article body text (see description())
my $tx = Text::Xslate->new(
    function => {
        p => sub {
            my $link = shift;
            return DateTime::Format::Mail->format_datetime( pubdate($link) );
        },

        # The title is passed through as a character string. The rendered
        # document is UTF-8 encoded once when it is printed; encoding the
        # title here as well would double-encode every non-ASCII character.
        t => sub { return shift; },

        g => sub { return URI->new_abs( shift, $base )->as_string; },
        d => \&description,
    },
);
# RSS 2.0 template in Text::Xslate's Kolon syntax. Variables are only
# expanded inside <: ... :> tags, so the channel date must be written as
# <: $now :>; a bare $now would be emitted literally.
my $template = q{<?xml version="1.0" encoding="UTF-8"?>
<rss version="2.0">
<channel>
<title>中国新聞 自家RSS</title>
<language>ja</language>
<pubDate><: $now :></pubDate>
: for $result -> $data {
<item>
<title><: t($data.title) :></title>
<pubDate><: p($data.link) :></pubDate>
<guid><: g($data.link) :></guid>
<description><: d($data.link) :></description>
</item>
: }
</channel>
</rss>
};

# Feed build time, expressed in Japanese local time.
my $build_date = DateTime::Format::Mail->format_datetime(
    DateTime->now()->set_time_zone('Asia/Tokyo') );

# Render the feed and write it to STDOUT as UTF-8.
print encode_utf8(
    $tx->render_string(
        $template,
        {
            result => $result,
            now    => $build_date,
        }
    )
);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment