yseto · December 11, 2015 01:18
diff --git a/chugoku-np.pl b/chugoku-np.pl
 #!/usr/bin/env perl

 #
 # Copyright 2013 yseto
 # Released under the MIT License - Please reuse change and share
 #

 use strict;
 use warnings;
 use utf8;
 use Web::Scraper;
 use URI;
 use Encode qw/encode_utf8/;
 use DateTime;
 use DateTime::Format::Mail;
 use Text::Xslate;
 use FindBin;
 use Digest::MD5 qw(md5_hex);
 use IO::All;

 # Index page listing the latest articles; also the base against which
 # relative article links are resolved.
 my $base = "http://www.chugoku-np.co.jp/News/";

 # Cache directory, kept next to this script.
 my $dir = "$FindBin::Bin/cache/";

 # Fail early with a clear message if the cache directory cannot be created,
 # rather than failing later on the first cache write.
 unless ( -d $dir ) {
     mkdir $dir or die "Cannot create cache directory $dir: $!";
 }

 # Cache lifetime in seconds (three days).
 my $expire = 86400 * 3;

 # Scraper (main): the index page is reduced to a list of { title, link }
 # hashes, one per <li> of the news list.
 my $entry = scraper {
     process 'a', 'link' => '@href', 'title' => 'text';
     result qw(title link);
 };
 my $scraper = scraper {
     process '//div[@class="newsList2 clearfix ofh"]/ul/li',
       'items[]' => $entry;
     result 'items';
 };

 # Scrape the index page addressed by $base.
 my $index_uri = URI->new($base);
 my $result    = $scraper->scrape($index_uri);

 # Derive an article's publication date from its link.
 #
 # Links containing "Tn<YYYY><MM><DD><NNNN>.html" carry the date in the file
 # name; for those the result is midnight of that day in Asia/Tokyo.  Any
 # other link, an undefined link, or a date DateTime rejects (e.g. month 13)
 # yields the current time instead.
 #
 # Takes the link string; returns a DateTime object.
 sub pubdate {
     my $link = shift;

     # The list-context match copies the captures immediately.  The variable
     # is deliberately not called $a, which is reserved for sort().
     my ( $year, $month, $day ) =
       defined $link
       ? $link =~ /Tn(\d{4})(\d{2})(\d{2})(\d{4})\.html/
       : ();

     if ( defined $year ) {

         # DateTime->new throws on impossible dates; treat those like a link
         # without a date instead of aborting the whole feed.
         my $dt = eval {
             DateTime->new(
                 year      => $year,
                 month     => $month,
                 day       => $day,
                 hour      => 0,
                 minute    => 0,
                 second    => 0,
                 time_zone => 'Asia/Tokyo',
             );
         };
         return $dt if defined $dt;
     }

     return DateTime->now();
 }

 # Return the body text of an article, using an on-disk cache.
 #
 # Takes the article link as found on the index page (may be relative to
 # $base).  Returns the text of the page's div with class "txt" as UTF-8
 # encoded bytes, or "" when the article is older than $expire seconds, the
 # link is missing, or no text could be obtained.
 #
 # Cache files live in $dir and are named after the MD5 hex digest of the
 # link.  Failed or empty fetches are not cached, so they are retried on the
 # next run.
 sub description {
     my $tn = shift;

     # Entries without a link have nothing to fetch.
     return "" unless defined $tn and length $tn;

     # Cache file for this link.
     my $cache = $dir . md5_hex($tn);

     # Expired article: drop any cached copy and emit no description.
     if ( ( time() - pubdate($tn)->epoch ) > $expire ) {
         unlink $cache if -f $cache;
         return "";
     }

     # Serve from the cache when a copy exists.
     if ( -f $cache ) {
         my $str < io($cache);
         return $str;
     }

     # Pause before each article fetch (presumably to throttle requests).
     sleep 1;
     my $url = URI->new_abs( $tn, $base )->as_string;

     my $d_scraper = scraper {
         process '//div[@class="txt"]', 'content' => 'TEXT';
         result 'content';
     };

     # A failed request or a page without the expected div must neither abort
     # the feed nor leave an empty cache file behind.
     my $text = eval { $d_scraper->scrape( URI->new($url) ) };
     unless ( defined $text ) {
         my $reason = $@ ? ": $@" : "\n";
         warn "description: no content for $url$reason";
         return "";
     }

     my $content = encode_utf8($text);

     # Store the encoded text for later runs.
     $content > io($cache);

     return $content;
 }

 # Helper functions callable from the feed template.
 my %feed_helpers = (

     # p: article link -> date string produced by DateTime::Format::Mail
     p => sub {
         my ($link) = @_;
         return DateTime::Format::Mail->format_datetime( pubdate($link) );
     },

     # t: title -> UTF-8 encoded title
     t => sub {
         my ($title) = @_;
         return encode_utf8($title);
     },

     # g: article link -> absolute URL (resolved against $base)
     g => sub {
         my ($link) = @_;
         return URI->new_abs( $link, $base )->as_string;
     },

     # d: article link -> article text (see description)
     d => \&description,
 );

 my $tx = Text::Xslate->new( function => \%feed_helpers );

 # RSS 2.0 feed template (Text::Xslate Kolon syntax).  q{} does not
 # interpolate, so every variable, including the channel's $now, has to be
 # emitted through a <: ... :> tag.
 my $template = q{<?xml version="1.0" encoding="UTF-8"?>
 <rss version="2.0">
 <channel>
    <title>中国新聞 自家RSS</title>
    <language>ja</language>
    <pubDate><: $now :></pubDate>
    : for $result -> $data {
    <item>
      <title><: t($data.title) :></title>
      <pubDate><: p($data.link) :></pubDate>
      <guid><: g($data.link) :></guid>
      <description><: d($data.link) :></description>
    </item>
    : }
  </channel>
 </rss>
 };

 # Render the feed and write it to STDOUT, UTF-8 encoded.
 {
     my $tokyo_now = DateTime->now()->set_time_zone('Asia/Tokyo');
     my %vars      = (
         result => $result,
         now    => DateTime::Format::Mail->format_datetime($tokyo_now),
     );
     print encode_utf8( $tx->render_string( $template, \%vars ) );
 }
	#!/usr/bin/env perl

	#
	# Copyright 2013 yseto
	# Released under the MIT License - Please reuse change and share
	#

	use strict;
	use warnings;
	use utf8;
	use Web::Scraper;
	use URI;
	use Encode qw/encode_utf8/;
	use DateTime;
	use DateTime::Format::Mail;
	use Text::Xslate;
	use FindBin;
	use Digest::MD5 qw(md5_hex);
	use IO::All;

	# Index page listing the latest articles; also the base against which
	# relative article links are resolved.
	my $base = "http://www.chugoku-np.co.jp/News/";

	# Cache directory, kept next to this script.
	my $dir = "$FindBin::Bin/cache/";

	# Fail early with a clear message if the cache directory cannot be created,
	# rather than failing later on the first cache write.
	unless ( -d $dir ) {
	    mkdir $dir or die "Cannot create cache directory $dir: $!";
	}

	# Cache lifetime in seconds (three days).
	my $expire = 86400 * 3;

	# Scraper (main): the index page is reduced to a list of { title, link }
	# hashes, one per <li> of the news list.
	my $entry = scraper {
	    process 'a', 'link' => '@href', 'title' => 'text';
	    result qw(title link);
	};
	my $scraper = scraper {
	    process '//div[@class="newsList2 clearfix ofh"]/ul/li',
	      'items[]' => $entry;
	    result 'items';
	};

	# Scrape the index page addressed by $base.
	my $index_uri = URI->new($base);
	my $result    = $scraper->scrape($index_uri);

	# Derive an article's publication date from its link.
	#
	# Links containing "Tn<YYYY><MM><DD><NNNN>.html" carry the date in the file
	# name; for those the result is midnight of that day in Asia/Tokyo.  Any
	# other link, an undefined link, or a date DateTime rejects (e.g. month 13)
	# yields the current time instead.
	#
	# Takes the link string; returns a DateTime object.
	sub pubdate {
	    my $link = shift;

	    # The list-context match copies the captures immediately.  The variable
	    # is deliberately not called $a, which is reserved for sort().
	    my ( $year, $month, $day ) =
	      defined $link
	      ? $link =~ /Tn(\d{4})(\d{2})(\d{2})(\d{4})\.html/
	      : ();

	    if ( defined $year ) {

	        # DateTime->new throws on impossible dates; treat those like a link
	        # without a date instead of aborting the whole feed.
	        my $dt = eval {
	            DateTime->new(
	                year      => $year,
	                month     => $month,
	                day       => $day,
	                hour      => 0,
	                minute    => 0,
	                second    => 0,
	                time_zone => 'Asia/Tokyo',
	            );
	        };
	        return $dt if defined $dt;
	    }

	    return DateTime->now();
	}

	# Return the body text of an article, using an on-disk cache.
	#
	# Takes the article link as found on the index page (may be relative to
	# $base).  Returns the text of the page's div with class "txt" as UTF-8
	# encoded bytes, or "" when the article is older than $expire seconds, the
	# link is missing, or no text could be obtained.
	#
	# Cache files live in $dir and are named after the MD5 hex digest of the
	# link.  Failed or empty fetches are not cached, so they are retried on the
	# next run.
	sub description {
	    my $tn = shift;

	    # Entries without a link have nothing to fetch.
	    return "" unless defined $tn and length $tn;

	    # Cache file for this link.
	    my $cache = $dir . md5_hex($tn);

	    # Expired article: drop any cached copy and emit no description.
	    if ( ( time() - pubdate($tn)->epoch ) > $expire ) {
	        unlink $cache if -f $cache;
	        return "";
	    }

	    # Serve from the cache when a copy exists.
	    if ( -f $cache ) {
	        my $str < io($cache);
	        return $str;
	    }

	    # Pause before each article fetch (presumably to throttle requests).
	    sleep 1;
	    my $url = URI->new_abs( $tn, $base )->as_string;

	    my $d_scraper = scraper {
	        process '//div[@class="txt"]', 'content' => 'TEXT';
	        result 'content';
	    };

	    # A failed request or a page without the expected div must neither abort
	    # the feed nor leave an empty cache file behind.
	    my $text = eval { $d_scraper->scrape( URI->new($url) ) };
	    unless ( defined $text ) {
	        my $reason = $@ ? ": $@" : "\n";
	        warn "description: no content for $url$reason";
	        return "";
	    }

	    my $content = encode_utf8($text);

	    # Store the encoded text for later runs.
	    $content > io($cache);

	    return $content;
	}

	# Helper functions callable from the feed template.
	my %feed_helpers = (

	    # p: article link -> date string produced by DateTime::Format::Mail
	    p => sub {
	        my ($link) = @_;
	        return DateTime::Format::Mail->format_datetime( pubdate($link) );
	    },

	    # t: title -> UTF-8 encoded title
	    t => sub {
	        my ($title) = @_;
	        return encode_utf8($title);
	    },

	    # g: article link -> absolute URL (resolved against $base)
	    g => sub {
	        my ($link) = @_;
	        return URI->new_abs( $link, $base )->as_string;
	    },

	    # d: article link -> article text (see description)
	    d => \&description,
	);

	my $tx = Text::Xslate->new( function => \%feed_helpers );

	# RSS 2.0 feed template (Text::Xslate Kolon syntax).  q{} does not
	# interpolate, so every variable, including the channel's $now, has to be
	# emitted through a <: ... :> tag.
	my $template = q{<?xml version="1.0" encoding="UTF-8"?>
	<rss version="2.0">
	<channel>
	<title>中国新聞自家RSS</title>
	<language>ja</language>
	<pubDate><: $now :></pubDate>
	: for $result -> $data {
	<item>
	<title><: t($data.title) :></title>
	<pubDate><: p($data.link) :></pubDate>
	<guid><: g($data.link) :></guid>
	<description><: d($data.link) :></description>
	</item>
	: }
	</channel>
	</rss>
	};

	# Render the feed and write it to STDOUT, UTF-8 encoded.
	{
	    my $tokyo_now = DateTime->now()->set_time_zone('Asia/Tokyo');
	    my %vars      = (
	        result => $result,
	        now    => DateTime::Format::Mail->format_datetime($tokyo_now),
	    );
	    print encode_utf8( $tx->render_string( $template, \%vars ) );
	}