use strict;
use warnings;
use 5.014;
use utf8;
use Encode;
use Irssi;
use POSIX ();
our $VERSION = "1.6";
our %IRSSI = (
authors => 'David Leadbeater',
contact => '[email protected]',
name => 'urlinfo',
description => 'Print short summaries about URLs from known services that are mentioned on IRC. (Including YouTube, etc.)',
license => 'WTFPL ',
url => 'http://dgl.cx/irssi',
);
# This is needed so it can still run standalone for testing.
BEGIN {
Irssi->import(20140628) if __PACKAGE__ =~ /Irssi/; # This needs Irssi 0.8.17
}
BEGIN {
eval {
require HTML::TreeBuilder;
require URI;
} or do {
print "\x{3}8You need to install HTML::TreeBuilder and URI";
if (-f "/etc/debian_version") {
print "Try running: \x{3}9sudo apt-get install libhtml-treebuilder-perl libwww-perl";
}
die $@;
}
}
# ------- Settings
# /SET urlinfo_title_unknown ON|OFF
# Show title of all unknown sites. (There is a timeout to prevent against
# obvious resource exhaustion attacks, but remember this script has no
# warranty.)
#
# /SET urlinfo_timeout 10
# How many seconds after which to give up trying to fetch a URL
#
# /SET urlinfo_ignore_domains example\.org example\.com
# Space separated list of regular expressions of domains to ignore
#
# /SET urlinfo_ignore_targets freenode #something efnet/#example
# Space separated list of targets to ignore.
#
# /SET urlinfo_send_channels freenode #something efnet/#example
# Space separated list of targets to post in the channel.
#
# /SET urlinfo_custom_domains my\.domain/thing irssi\.org=description
# A limited way of configuring custom domains, if you need something more
# complex edit SITES below.
# Format: domain[/path[=from]]
# ------- Sites configuration
# This script aims to be data driven, this hash has "site_name => {site details}"
# site details is a hash reference which can contain:
# cleanup: A regexp of text to remove from the resulting string
# domain: A string (or regexp, with qr//) of the domain to match (www. is
# removed automatically).
# from: Where to read the info "title", "description" (meta description) or
# a regexp to match the content (default "title")
# example: Example of this URL
# expected: What the example should return (see end for testing)
# items: An array ref of additional hashes to allow multiple of these values
# e.g.: items => [ { domain => "example.com" } ]
# path: Path component (string or regexp)
#
my %SITES = (
vimeo => {
cleanup => qr/\s*on Vimeo$/,
domain => "vimeo.com",
path => qr{/\d+},
example => "http://vimeo.com/80871338",
expected => "Journey Part 1",
},
youtube => {
cleanup => qr/\s*-\s*YouTube$/,
items => [
{
domain => "youtu.be",
example => "http://youtu.be/ghGoI7xVtSI",
expected => "Rick Astley - Never Gonna Give You Up (Live 1987)",
},
{
domain => "youtube.com",
path => "/watch",
example => "https://www.youtube.com/watch?v=ghGoI7xVtSI",
expected => "Rick Astley - Never Gonna Give You Up (Live 1987)",
},
],
},
metacpan => {
cleanup => qr/\s*-\s*metacpan\.org$/,
domain => "metacpan.org",
path => qr{^/pod/},
example => "https://metacpan.org/pod/release/DOY/Reply-0.34/lib/Reply.pm",
from => "description",
expected => "read, eval, print, loop, yay!",
},
pypi => {
domain => "pypi.python.org",
path => qr{^/pypi/},
from => "description",
example => "https://pypi.python.org/pypi/stanford-corenlp-python/3.3.6-0",
expected => "A Stanford Core NLP wrapper (wordseer fork)",
},
gist => {
cleanup => qr/\s*-\s*Gist is .*$/,
domain => "gist.github.com",
from => ["og:title", "description"],
example => "https://gist.github.com/dgl/792206",
expected => "An install script that installs a development version of perl (from ".
"git) and keeps a particular set of modules installed. Sort of ".
"perlbrew for blead, but not quite.",
},
github => {
domain => "github.com",
items => [
{ # issue, commit or pull
cleanup => qr/\s*·.*$/,
path => qr{^/[^/]+/[^/]+/(?:issues|commit|pull)/[a-f0-9]},
example => "https://github.com/irssi/irssi/commit/669add",
expected => "FS#155 hilight -tag",
},
{ # user or project
from => "og:description",
path => qr{^/[^/]+(?:/[^/]+)?$},
example => "https://github.com/irssi/irssi",
expected => "irssi - The client of the future",
},
],
},
);
# ------- Site handling
sub expand {
my @expanded_sites;
for my $site(keys %SITES) {
expand_site(\@expanded_sites, $site, $SITES{$site}, {});
}
return @expanded_sites;
}
# This essentially implements inheritance (via the "items" key), to reduce
# duplication.
sub expand_site {
my($expanded_sites, $site, $site_data, $current) = @_;
delete $current->{items};
my $s = {
name => $site,
from => "title",
%$current,
%$site_data,
};
if (exists $s->{items}) {
expand_site($expanded_sites, $site, $_, $s) for @{$s->{items}};
} else {
push @$expanded_sites, $s;
}
}
sub _matcher {
my($site, $item) = @_;
return 1 unless defined $site;
return 1 if ref $site eq 'ARRAY' && grep _matcher($_, $item), @$site;
return 1 if ref $site && $site->isa("Regexp") && $item =~ $site;
return $site eq $item;
}
sub get_site {
my($sites, $url) = @_;
my $uri = URI->new($url);
$uri = URI->new("http://$url") unless $uri and $uri->scheme;
return unless $uri and $uri->can("host") and $uri->host and $uri->scheme =~ /^https?$/;
for my $site(@$sites) {
my $match = 1;
$match &&= _matcher($site->{domain}, $uri->host =~ s/^www\.//ri);
$match &&= _matcher($site->{$_}, $uri->$_) for qw(scheme host path query fragment);
return $site, $uri if $match;
}
if (Irssi::settings_get_bool("urlinfo_title_unknown")) {
return { name => "unknown", from => "title" }, $uri;
}
return;
}
my %from = (
title => sub {
$_[0]->look_down(_tag => 'title')->as_trimmed_text;
},
description => sub {
my $el = $_[0]->look_down(_tag => 'meta', name => 'description');
$el && $el->attr('content');
},
'og:description' => sub {
my $el = $_[0]->look_down(_tag => 'meta', property => 'og:description');
$el && $el->attr('content');
},
'og:title' => sub {
my $el = $_[0]->look_down(_tag => 'meta', property => 'og:title');
$el && $el->attr('content');
},
);
sub get_info {
my($site, $uri) = @_;
my $from = $site->{from};
my $tree = HTML::TreeBuilder->new_from_url($uri);
my $info;
if (!ref $from || ref $from eq 'ARRAY') {
$info = join ": ", grep defined, map $from{$_}->($tree), ref $from ? @$from : $from;
} else {
$info = join "", $tree->as_html =~ $from;
}
$info =~ s/$site->{cleanup}// if $site->{cleanup};
$info =~ s/([\x00-\x19])/sprintf "\\x%x", ord $1/ger;
}
# ------- IRC message handling
# John Gruber's URL regexp (nicely handles people putting URLs in parens, etc)
my $URL_RE = qr{((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»ââââ]))};
my $pipe_in_progress;
my @sites;
my $timeout = 10;
sub msg {
my($server, $text, $nick, undef, $target) = @_;
# TODO: Add a queue / multiple pipe support?
return if $pipe_in_progress;
my $msg_time = time;
my $tag = $server->{tag};
$target = $target || $nick;
$text = Irssi::strip_codes($text);
if (my($url) = $text =~ $URL_RE) {
my($site, $uri) = get_site(\@sites, $url);
return unless $site;
return if ignored($uri, $server, $target);
fork_wrapper(sub { # Child
my($fh) = @_;
syswrite $fh, " " . encode_utf8(get_info($site, $uri));
},
sub { # Parent
my $in = decode_utf8($_[0]);
if ($in =~ s/^- //) {
print "\x{3}4urlinfo error:\x{3} $in";
return;
}
$in =~ s/^ //;
return unless $in;
# Avoid reusing server just in case it is no longer valid
my $server = Irssi::server_find_tag($tag);
my $win = find_window($server, $target);
my $view = $win->view;
my $line = $view->get_lines;
while ($line && ($line = $line->next)) {
if ($line->{info}->{time} >= $msg_time) {
if ($line->get_text(0) =~ /\Q$url/) {
last;
}
}
}
my $timestamp = POSIX::strftime(
Irssi::settings_get_str("timestamp_format"), localtime $msg_time);
# I'm sure I shouldn't have to care about colours here...
my $pad = length Irssi::strip_codes($timestamp);
if (not(send2channel($server,$target,$url,$in))) {
my $text = $win->format_get_text(__PACKAGE__, $server, $target,
"urlinfo", " " x $pad, $in);
$win->print_after($line, MSGLEVEL_NO_ACT|MSGLEVEL_CLIENTCRAP,
$text, $msg_time);
$view->redraw;
}
});
}
}
sub send2channel {
my ($server,$target,$url,$in) =@_;
my @cl= split(" ",Irssi::settings_get_str('urlinfo_send_channels'));
my $s=0;
foreach ( @cl) {
if ( $_ eq $target || $_ eq $server->{tag}."/".$target) {
$s=1;
$server->command("msg $target urlinfo: $url -> $in");
last;
}
}
return $s;
}
sub ignored {
my($uri, $server, $target) = @_;
my @ignored_domains = split / /, Irssi::settings_get_str('urlinfo_ignore_domains');
my $domain = $uri->host =~ s/^www\.//r;
return 1 if grep $domain =~ /^$_$/, @ignored_domains;
my $chans = $server->isupport("chantypes") || '#&';
my $chan_match = qr/^[$chans]/;
for my $ignored_target (split / /, Irssi::settings_get_str('urlinfo_ignore_targets')) {
my($mtag, $mtarget) = split m{/}, $ignored_target;
if ($mtag =~ $chan_match) {
$mtarget = $mtag;
$mtag = "*";
}
return 1 if _match($mtag, $server->{tag}) &&
(!$mtarget || _match($mtarget, $target));
}
return 0;
}
sub _match {
my($pattern, $name) = @_;
$pattern =~ s/\*/.*/g;
$name =~ /^$pattern$/i;
}
sub find_window {
my($server, $target) = @_;
if (my $witem = $server->window_item_find($target)) {
return $witem->window;
} else {
# Maybe they have a msgs window?
my $win = Irssi::window_find_name("(msgs)");
# Ultimate fallback
$win = Irssi::window_find_refnum(1) unless $win;
return $win;
}
}
# Based on scriptassist.
sub fork_wrapper {
my($child, $parent) = @_;
pipe(my $rfh, my $wfh);
my $pid = fork;
$pipe_in_progress = 1;
return unless defined $pid;
if ($pid) {
close $wfh;
Irssi::pidwait_add($pid);
my $pipetag;
my @args = ($rfh, \$pipetag, $parent);
$pipetag = Irssi::input_add(fileno($rfh), Irssi::INPUT_READ, \&pipe_input, \@args);
} else {
eval {
local $SIG{ALRM} = sub { die "Timed out\n" };
alarm $timeout;
$child->($wfh);
};
alarm 0;
syswrite $wfh, encode_utf8("- $@") if $@;
POSIX::_exit(1);
}
}
sub pipe_input {
my ($rfh, $pipetag, $parent) = @{$_[0]};
my $line = <$rfh>;
close($rfh);
Irssi::input_remove($$pipetag);
$pipe_in_progress = 0;
$parent->($line);
}
sub setup_changed {
$timeout = Irssi::settings_get_int("urlinfo_timeout");
@sites = expand();
for my $site (split / /, Irssi::settings_get_str("urlinfo_custom_domains")) {
next unless $site;
my($re, $from) = split /=/, $site;
$from ||= "title";
my($domain, $path) = split m{/}, $re, 2;
expand_site(\@sites, "custom", {
domain => qr/^$domain$/,
path => defined $path ? qr/^\/$path/ : undef,
from => $from,
}, {});
}
}
# ------- Initialization
if (caller) {
# Irssi specific initialization
require Irssi::TextUI;
Irssi::settings_add_str($IRSSI{name}, "urlinfo_custom_domains", "");
Irssi::settings_add_str($IRSSI{name}, "urlinfo_ignore_domains", "");
Irssi::settings_add_str($IRSSI{name}, "urlinfo_ignore_targets", "");
Irssi::settings_add_str($IRSSI{name}, "urlinfo_send_channels", "");
Irssi::settings_add_int($IRSSI{name}, "urlinfo_timeout", $timeout);
Irssi::settings_add_bool($IRSSI{name}, "urlinfo_title_unknown", 0);
Irssi::signal_add("message irc action" => \&msg);
Irssi::signal_add("message private" => \&msg);
Irssi::signal_add("message public" => \&msg);
Irssi::signal_add_last("setup changed", \&setup_changed);
setup_changed();
Irssi::theme_register([
'urlinfo' => '$0 %Kinfo:%n $1',
]);
} else {
# Built in test. Run this script outside Irssi to use.
@sites = expand();
for my $site(@sites) {
next unless $site->{example};
my($found_site, $uri) = get_site(\@sites, $site->{example});
if ($found_site != $site) {
die "Got $found_site->{name}, expected $site->{name}";
}
say "Get $uri";
my $result = get_info($site, $uri);
say $result;
die "Got $result, expected $site->{expected}" unless $result eq $site->{expected};
}
say "OK";
}