Created
April 8, 2017 19:02
-
-
Save valentinitnelav/77159b63a986ff99d9471a4b3038ebc7 to your computer and use it in GitHub Desktop.
Read <table> HTML tag with {XML} library
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Read <table> HTML tag with {XML} library
# ====================================================
# Load library
library(XML)

# =======================
# Read the web page [accessed 08-Apr-2017]
# =======================
link <- "http://www.theplantlist.org/1.1/statistics/"
# NOTE: it is good practice to store the page locally instead of re-reading
# it many times unnecessarily, which would overload their server.
# Parse the page once; every table extraction below reuses this parsed
# document rather than hitting the URL again.
link.scrap <- htmlParse(link)
# =======================
# Read all tables from the page,
# then select desired table
# =======================
# readHTMLTable() on the parsed document returns a list with one element
# per <table> found on the page.
tbls.lst <- readHTMLTable(link.scrap)
my.tbl.1 <- tbls.lst[[1]] # select first table
# Note that some unwanted columns were read as well
my.tbl.1
## V1 V2 V3 V4
##1 <U+25D5> Accepted 350,699 33.0%
##2 <U+25D5> Synonym 470,624 44.2%
##3 <U+25D5> Unplaced 243 0.0%
##4 <U+25D5> Unassessed 242,469 22.8%
# Also, every column was read as a factor (not as character or numeric)!
str(my.tbl.1)
##'data.frame': 4 obs. of 4 variables:
## $ V1: Factor w/ 1 level "<U+25D5>""| __truncated__: 1 1 1 1
## $ V2: Factor w/ 4 levels "Accepted","Synonym",..: 1 2 4 3
## $ V3: Factor w/ 4 levels "242,469","243",..: 3 4 2 1
## $ V4: Factor w/ 4 levels "0.0%","22.8%",..: 3 4 1 2
# =======================
# Read specific table(s) from the page
# directly, using the table's XPath selector
# =======================
# -----------------------
# Simple reading using XPath selector
# -----------------------
# Gives the same result as selecting the first table from the list of all
# tables. xpathApply() returns a list with one element per matched node,
# hence the [[1]] to extract the single data frame.
my.tbl.2 <- xpathApply(
  doc = link.scrap,
  path = '//*[@id="columns"]/section/div[1]/table',
  fun = readHTMLTable
)[[1]]
# -----------------------
# Adjusting for header, column classes & other tweaks
# -----------------------
# Extra arguments after `fun` are forwarded by xpathApply() to
# readHTMLTable():
#   header           - column names to use for the columns that are kept
#   colClasses       - one entry per column of the HTML table; NULL drops
#                      the column (here the first, icon-only column), while
#                      "FormattedInteger" and "Percent" convert values such
#                      as "350,699" and "33.0%" to numbers (see output below)
#   stringsAsFactors - keep text as character instead of factor
my.tbl.3 <- xpathApply(
  doc = link.scrap,
  path = '//*[@id="columns"]/section/div[1]/table',
  fun = readHTMLTable,
  header = c("Status", "Total", "Total_prc"),
  colClasses = list(NULL, "character", "FormattedInteger", "Percent"),
  stringsAsFactors = FALSE,
  skip.rows = 1L,
  trim = TRUE
)[[1]]
my.tbl.3
## Status Total Total_prc
##1 Accepted 350699 33.0
##2 Synonym 470624 44.2
##3 Unplaced 243 0.0
##4 Unassessed 242469 22.8
# The type of data is as desired now
str(my.tbl.3)
##'data.frame': 4 obs. of 3 variables:
## $ Status : chr "Accepted" "Synonym" "Unplaced" "Unassessed"
## $ Total : int 350699 470624 243 242469
## $ Total_prc: num 33 44.2 0 22.8
# -----------------------
# Select multiple tables via their XPath selectors
# -----------------------
# Combine several XPath selectors into a single union expression ("a | b").
# The `path` argument is documented as a single string, so the union is
# built explicitly instead of passing a character vector.
# NOTE(review): with a union, matches presumably come back in document
# order rather than in the order the selectors are listed - confirm before
# relying on the positions of the list elements.
tbls.lst.2 <- xpathApply(
  doc = link.scrap,
  path = paste(
    c(
      '//*[@id="columns"]/section/div[1]/table',
      '//*[@id="columns"]/section/table[1]'
    ),
    collapse = " | "
  ),
  fun = readHTMLTable,
  stringsAsFactors = FALSE
)
tbls.lst.2
##[[1]]
## V1 V2 V3 V4
##1 <U+25D5> Accepted 350,699 33.0%
##2 <U+25D5> Synonym 470,624 44.2%
##3 <U+25D5> Unplaced 243 0.0%
##4 <U+25D5> Unassessed 242,469 22.8%
##[[2]]
## V1 V2 V3 V4 V5 V6 V7 V8
##1 High confidence 149,349 229,242 0 0 378,591 35.6%
##2 Medium confidence 193,013 214,107 0 0 407,120 38.3%
##3 Low confidence 8,337 27,275 243 242,469 278,324 26.2%
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment