jwatr
: Tools to Query and Create Web Archive Files Using the Java Web Archive Toolkit
The Java Web Archive Toolkit ('JWAT', <https://sbforge.org/display/JWAT/Overview>) is a library of Java objects and methods which enables reading, writing and validating web archive files.
WIP!!! Reading & writing need some optimization and edge case checking. There's also a chance I'll change the name to warc
but some folks are using that package now and I dinna want to cause pain there yet.
The following functions are implemented:
Reading
read_warc
: Read a WARC file (compressed or uncompressed)
warc_stream_in
: Stream in records from a WARC file
Writing
warc_file
: Create a new WARC file
warc_write_warcinfo
: Write a 'warcinfo' record to a WARC file
warc_write_response
: Write simple httr::GET requests or full httr response objects to a WARC file
close_warc_file
: Close a WARC file
httr Wrappers
warc_GET
: WARC-ify an httr::GET request
warc_POST
: WARC-ify an httr::POST request
Utility
response_list_to_warc_file
: Turns a list of 'httr' 'response' objects into a WARC file
payload_content
: Helper function to convert WARC raw headers+payload into something useful
is_compressed
: Test if a raw vector is gzip compressed
NOTE: To read in typical (~800MB-1GB) gzip'd WARC files, you should consider doing the following (in order) in your scripts:
# Request a 2 GB maximum Java heap. This option has to be set before rJava is
# loaded, which is why the statements below must run in this order.
options(java.parameters = "-Xmx2g")
library(rJava)
library(jwatjars)
library(jwatr)
That idiom generally provides enough heap space, but you may need to adjust the heap size if you've got larger payloads.
Alternatively, you can set the same option in your R startup scripts, but that will likely come back to bite you when moving workloads around.
devtools::install_github("hrbrmstr/jwatr")
library(rJava)
library(jwatr)
library(magick)
library(tidyverse)
# current version
packageVersion("jwatr")
## [1] '0.2.0'
# small, uncompressed WARC file
glimpse(read_warc(system.file("extdata/bbc.warc", package="jwatr")))
## Observations: 1
## Variables: 13
## $ target_uri <chr> "http://news.bbc.co.uk/2/hi/africa/3414345.stm"
## $ ip_address <chr> "212.58.244.61"
## $ warc_content_type <chr> "application/http; msgtype=response"
## $ warc_type <chr> "response"
## $ content_length <dbl> 43428
## $ payload_type <chr> NA
## $ profile <chr> NA
## $ date <dttm> 2014-08-02
## $ http_status_code <dbl> 200
## $ http_protocol_content_type <chr> "text/html"
## $ http_version <chr> "HTTP/1.1"
## $ http_raw_headers <list> [<48, 54, 54, 50, 2f, 31, 2e, 31, 20, 32, 30, 30, 20, 4f, 4b, 0a, 53, 65, 72, 7...
## $ warc_record_id <chr> "<urn:uuid:ffbfb0c0-6456-42b0-af03-3867be6fc09f>"
# larger example
xdf <- read_warc(system.file("extdata/sample.warc.gz", package="jwatr"),
warc_types = "response", include_payload = TRUE)
glimpse(xdf)
## Observations: 299
## Variables: 14
## $ target_uri <chr> "dns:www.archive.org", "http://www.archive.org/robots.txt", "http://www.archive....
## $ ip_address <chr> "68.87.76.178", "207.241.229.39", "207.241.229.39", "207.241.229.39", "207.241.2...
## $ warc_content_type <chr> "text/dns", "application/http; msgtype=response", "application/http; msgtype=res...
## $ warc_type <chr> "response", "response", "response", "response", "response", "response", "respons...
## $ content_length <dbl> 56, 782, 680, 29000, 1963, 1424, 564, 50832, 14473, 66, 260, 16969, 59, 3135, 13...
## $ payload_type <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ profile <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ...
## $ date <dttm> 2008-04-30, 2008-04-30, 2008-04-30, 2008-04-30, 2008-04-30, 2008-04-30, 2008-04...
## $ http_status_code <dbl> NA, 200, 200, 200, 200, 200, 200, 200, 200, NA, 200, 200, NA, 200, 200, 200, 200...
## $ http_protocol_content_type <chr> NA, "text/plain; charset=UTF-8", "text/html; charset=UTF-8", "text/html; charset...
## $ http_version <chr> NA, "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTT...
## $ http_raw_headers <list> [<>, <48, 54, 54, 50, 2f, 31, 2e, 31, 20, 32, 30, 30, 20, 4f, 4b, 0d, 0a, 44, 6...
## $ warc_record_id <chr> "<urn:uuid:ff728363-2d5f-4f5f-b832-9552de1a6037>", "<urn:uuid:e7c9eff8-f5bc-4aeb...
## $ payload <list> [<32, 30, 30, 38, 30, 34, 33, 30, 32, 30, 34, 38, 32, 35, 0a, 77, 77, 77, 2e, 6...
# get the payload content
payload_content(url = xdf$target_uri[279], ctype = xdf$http_protocol_content_type[279],
xdf$http_raw_headers[[279]], xdf$payload[[279]])
## {xml_document}
## <html>
## [1] <head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8">\n<link rel="stylesheet" href="/styles ...
## [2] <body class="Home">\n\n<!--BEGIN HEADER 1-->\n<table style="background-color:white " cellspacing="0" width="100%" ...
# or ingest the raw bits yourself
imgs <- filter(xdf, grepl("(png|gif|jpeg)$", http_protocol_content_type))
imgs
## # A tibble: 55 x 14
## target_uri ip_address warc_content_type warc_type
## <chr> <chr> <chr> <chr>
## 1 http://www.archive.org/images/logoc.jpg 207.241.229.39 application/http; msgtype=response response
## 2 http://www.archive.org/images/go-button-gateway.gif 207.241.229.39 application/http; msgtype=response response
## 3 http://www.archive.org/images/star.png 207.241.229.39 application/http; msgtype=response response
## 4 http://www.archive.org/images/hewlett.jpg 207.241.229.39 application/http; msgtype=response response
## 5 http://www.archive.org/images/alexalogo-archive.gif 207.241.229.39 application/http; msgtype=response response
## 6 http://www.archive.org/images/djvu-download.gif 207.241.229.39 application/http; msgtype=response response
## 7 http://www.archive.org/images/alexa_websearch_logo.gif 207.241.229.39 application/http; msgtype=response response
## 8 http://www.archive.org/images/ta2004_icon.jpg 207.241.229.39 application/http; msgtype=response response
## 9 http://www.archive.org/images/lizardtech.gif 207.241.229.39 application/http; msgtype=response response
## 10 http://www.archive.org/images/LOCLogoSmall.jpg 207.241.229.39 application/http; msgtype=response response
## # ... with 45 more rows, and 10 more variables: content_length <dbl>, payload_type <chr>, profile <chr>, date <dttm>,
## # http_status_code <dbl>, http_protocol_content_type <chr>, http_version <chr>, http_raw_headers <list>,
## # warc_record_id <chr>, payload <list>
image_read(imgs$payload[[1]])
## format width height colorspace filesize
## 1 JPEG 70 56 sRGB 1662
library(jwatr)
library(httr)
library(magick)
library(tidyverse)
# Start a new WARC file at a temporary path.
tf <- tempfile("test")
wf <- warc_file(tf)

warc_write_response(wf, "https://rud.is/b/")

# store a simple httr::GET request
warc_write_response(wf, GET("https://rud.is/b/"))

warc_write_response(wf, "https://www.rstudio.com/")
warc_write_response(wf, "https://www.r-project.org/")

# all valid content types work, like this PDF
warc_write_response(wf, "http://che.org.il/wp-content/uploads/2016/12/pdf-sample.pdf")

# complex API calls can be made and the results stored in the WARC file as well
# this API call returns a JSON object
uk_res <- POST(
  url = "https://data.police.uk/api/crimes-street/all-crime",
  query = list(lat = "52.629729", lng = "-1.131592", date = "2017-01")
)
warc_write_response(wf, uk_res)

warc_write_response(wf, "https://journal.r-project.org/RLogo.png")

# Close the WARC file once every response has been written.
close_warc_file(wf)
xdf <- read_warc(sprintf("%s.warc.gz", tf), include_payload = TRUE)
glimpse(xdf)
## Observations: 7
## Variables: 14
## $ target_uri <chr> "https://rud.is/b/", "https://rud.is/b/", "https://www.rstudio.com/", "https://w...
## $ ip_address <chr> "2604:a880:800:10::6bc:2001", "2604:a880:800:10::6bc:2001", "104.196.200.5", "13...
## $ warc_content_type <chr> "application/http; msgtype=response", "application/http; msgtype=response", "app...
## $ warc_type <chr> "response", "response", "response", "response", "response", "response", "response"
## $ content_length <dbl> 38591, 38591, 334, 7480, 8207, 511592, 166003
## $ payload_type <chr> "text/html; charset=UTF-8", "text/html; charset=UTF-8", "text/html", "text/html"...
## $ profile <chr> NA, NA, NA, NA, NA, NA, NA
## $ date <dttm> 2017-09-04, 2017-09-04, 2017-09-04, 2017-09-04, 2017-09-04, 2017-09-04, 2017-09-04
## $ http_status_code <dbl> 200, 200, 403, 200, 200, 200, 200
## $ http_protocol_content_type <chr> "text/html; charset=UTF-8", "text/html; charset=UTF-8", "text/html", "text/html"...
## $ http_version <chr> "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1", "HTTP/1.1"
## $ http_raw_headers <list> [<48, 54, 54, 50, 2f, 31, 2e, 31, 20, 32, 30, 30, 20, 4f, 4b, 0d, 0a, 53, 65, 7...
## $ warc_record_id <chr> "<urn:uuid:25980e44-6242-4d41-90b3-1a4b6b40e1f3>", "<urn:uuid:fc1fd6a6-1cb4-498a...
## $ payload <list> [<3c, 21, 64, 6f, 63, 74, 79, 70, 65, 20, 68, 74, 6d, 6c, 3e, 0d, 0a, 0d, 0a, 3...
# decode the WARC stored JSON response from the UK Crimes API
glimpse(jsonlite::fromJSON(rawToChar(xdf[6,]$payload[[1]]), flatten=TRUE))
## Observations: 1,318
## Variables: 13
## $ category <chr> "anti-social-behaviour", "anti-social-behaviour", "anti-social-behaviour", "anti-so...
## $ location_type <chr> "Force", "Force", "Force", "Force", "Force", "Force", "Force", "Force", "Force", "F...
## $ context <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",...
## $ persistent_id <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",...
## $ id <int> 54165853, 54168392, 54164859, 54168391, 54168386, 54168384, 54168381, 54165308, 541...
## $ location_subtype <chr> "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "", "",...
## $ month <chr> "2017-01", "2017-01", "2017-01", "2017-01", "2017-01", "2017-01", "2017-01", "2017-...
## $ location.latitude <chr> "52.634324", "52.627880", "52.636256", "52.636250", "52.620211", "52.627363", "52.6...
## $ location.longitude <chr> "-1.135916", "-1.144730", "-1.125933", "-1.133691", "-1.133638", "-1.126878", "-1.1...
## $ location.street.id <int> 883326, 883453, 883316, 883415, 882352, 883250, 883332, 883356, 883271, 882445, 883...
## $ location.street.name <chr> "On or near St Martins", "On or near Tarragon Road", "On or near Wimbledon Street",...
## $ outcome_status.category <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ outcome_status.date <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
select(xdf, content_length, http_protocol_content_type)
## # A tibble: 7 x 2
## content_length http_protocol_content_type
## <dbl> <chr>
## 1 38591 text/html; charset=UTF-8
## 2 38591 text/html; charset=UTF-8
## 3 334 text/html
## 4 7480 text/html
## 5 8207 application/pdf
## 6 511592 application/json
## 7 166003 image/png
image_read(xdf$payload[[5]])
## format width height colorspace filesize
## 1 PDF 595 842 sRGB 27600
The warc_stream_in() function provides a pure-R method for stream processing WARC files through the use of an R callback handler. One way of using this is to build a data frame. The following example builds a data frame of WARC response records. Space is reserved for a 10,000-element list, which will get truncated or expanded as necessary:
# Reserve 10,000 empty (NULL) slots for the streamed records.
# Note: `list(10000)` would NOT do this -- it creates a one-element list whose
# only element is the number 10000. `vector("list", n)` is the pre-allocating
# form. Slots that are never filled stay NULL.
xdf <- vector("list", 10000)

# Number of records stored so far; doubles as the index of the last used slot.
xdf_i <- 0

# Callback handed to warc_stream_in(): stores one record per invocation in the
# next free slot of the global `xdf`.
#
#   headers  named collection of the record's header fields
#            (presumably a named list -- confirm against warc_stream_in())
#   payload  the record's payload, kept as a single list-column cell
#   ...      ignored
myfun <- function(headers, payload, ...) {
  # Replace "-" in the header names with "_" so they make convenient column
  # names once the records are bound into a data frame.
  headers <- setNames(headers, gsub("-", "_", names(headers)))
  # `<<-` is deliberate here: the counter and the accumulator live outside the
  # callback so that they persist across invocations.
  xdf_i <<- xdf_i + 1
  # Wrap the payload in a list so it occupies exactly one cell per record.
  headers$payload <- list(payload)
  # Assigning past the current length grows the list automatically.
  xdf[xdf_i] <<- list(headers)
}
(n <- warc_stream_in(
system.file("extdata/sample.warc.gz", package="jwatr"),
myfun,
warc_types = "response"
))
## [1] 299
xdf <- bind_rows(xdf)
glimpse(xdf)
## Observations: 299
## Variables: 9
## $ warc_type <chr> "response", "response", "response", "response", "response", "response", "response", "re...
## $ warc_target_uri <chr> "dns:www.archive.org", "http://www.archive.org/robots.txt", "http://www.archive.org/", ...
## $ warc_date <chr> "2008-04-30T20:48:25Z", "2008-04-30T20:48:25Z", "2008-04-30T20:48:26Z", "2008-04-30T20:...
## $ warc_ip_address <chr> "68.87.76.178", "207.241.229.39", "207.241.229.39", "207.241.229.39", "207.241.229.39",...
## $ warc_record_id <chr> "<urn:uuid:ff728363-2d5f-4f5f-b832-9552de1a6037>", "<urn:uuid:e7c9eff8-f5bc-4aeb-b3d2-9...
## $ content_type <chr> "text/dns", "application/http; msgtype=response", "application/http; msgtype=response",...
## $ content_length <chr> "56", "782", "680", "29000", "1963", "1424", "564", "50832", "14473", "66", "260", "169...
## $ payload <list> [<32, 30, 30, 38, 30, 34, 33, 30, 32, 30, 34, 38, 32, 35, 0a, 77, 77, 77, 2e, 61, 72, ...
## $ warc_payload_digest <chr> NA, "sha1:SUCGMUVXDKVB5CS2NL4R4JABNX7K466U", "sha1:2WAXX5NUWNNCS2BDKCO5OVDQBJVNKIVV", "...
count(xdf, content_type)
## # A tibble: 2 x 2
## content_type n
## <chr> <int>
## 1 application/http; msgtype=response 261
## 2 text/dns 38
cat(rawToChar(xdf$payload[[1]]))
## 20080430204825
## www.archive.org. 589 IN A 207.241.229.39
library(jwatr)
library(testthat)
date()
## [1] "Sun Sep 3 23:33:19 2017"
test_dir("tests/")
## testthat results ========================================================================================================
## OK: 1 SKIPPED: 0 FAILED: 0
##
## DONE ===================================================================================================================
Please note that this project is released with a Contributor Code of Conduct. By participating in this project you agree to abide by its terms.