Skip to content

Instantly share code, notes, and snippets.

@valentinitnelav
Created April 8, 2017 19:02
Show Gist options
  • Save valentinitnelav/77159b63a986ff99d9471a4b3038ebc7 to your computer and use it in GitHub Desktop.
Save valentinitnelav/77159b63a986ff99d9471a4b3038ebc7 to your computer and use it in GitHub Desktop.
Read <table> HTML tag with {XML} library
# Read <table> HTML tag with {XML} library
# ====================================================
# Load library
library(XML)
# =======================
# Read the web page [accessed 08-Apr-2017]
# =======================
link <- "http://www.theplantlist.org/1.1/statistics/"
# NOTE: it is good etiquette to fetch the page once and then work from a
# stored copy, instead of re-reading it from the server on every run.
# The page is therefore downloaded into the session's temporary directory only
# when no copy is there yet, and the local copy is what gets parsed.
# local() keeps the helper variables out of the global environment.
link.scrap <- local({
  cached.page <- file.path(tempdir(), "theplantlist_statistics.html")
  if (!file.exists(cached.page)) {
    # Download to a scratch file first, so that an interrupted or failed
    # download never leaves a partial file under the cache name.
    part.file <- tempfile(fileext = ".html")
    # mode = "wb" writes the bytes unchanged (no line-ending translation)
    status <- download.file(link, destfile = part.file,
                            mode = "wb", quiet = TRUE)
    if (status != 0L) {
      stop("Could not download ", link, call. = FALSE)
    }
    file.rename(part.file, cached.page)
  }
  htmlParse(cached.page)
})
# =======================
# Read all tables from the page,
# then select desired table
# =======================
# readHTMLTable() on the whole parsed document returns the tables as a list
tbls.lst <- readHTMLTable(link.scrap)
my.tbl.1 <- tbls.lst[[1]] # select first table
# Note that an unwanted column was read as well
# (V1, which holds only a symbol character)
my.tbl.1
## V1 V2 V3 V4
##1 <U+25D5> Accepted 350,699 33.0%
##2 <U+25D5> Synonym 470,624 44.2%
##3 <U+25D5> Unplaced 243 0.0%
##4 <U+25D5> Unassessed 242,469 22.8%
# Also, every column was read as a factor - even the counts (V3) and the
# percentages (V4) - so none of them is numeric yet (see str() output below)
str(my.tbl.1)
##'data.frame': 4 obs. of 4 variables:
## $ V1: Factor w/ 1 level "<U+25D5>""| __truncated__: 1 1 1 1
## $ V2: Factor w/ 4 levels "Accepted","Synonym",..: 1 2 4 3
## $ V3: Factor w/ 4 levels "242,469","243",..: 3 4 2 1
## $ V4: Factor w/ 4 levels "0.0%","22.8%",..: 3 4 1 2
# =======================
# Read specific table(s) from the page
# using directly the table's XPath selector
# =======================
# -----------------------
# Simple reading using XPath selector
# -----------------------
# gives the same results as above
# Run readHTMLTable() on the node(s) matched by the XPath expression and
# keep the first element of the returned list.
my.tbl.2 <- xpathApply(
  link.scrap,
  '//*[@id="columns"]/section/div[1]/table',
  readHTMLTable
)[[1]]
# -----------------------
# Adjusting for header, column classes & other tweaks
# -----------------------
# Same XPath selection as before; the named arguments after the function are
# forwarded to readHTMLTable() to set the column names, convert the columns
# to the desired classes (a NULL class drops that column), skip the first
# row, trim whitespace and keep text as character instead of factor.
my.tbl.3 <- xpathApply(
  link.scrap,
  '//*[@id="columns"]/section/div[1]/table',
  readHTMLTable,
  trim = TRUE,
  skip.rows = 1L,
  stringsAsFactors = FALSE,
  colClasses = list(NULL, "character", "FormattedInteger", "Percent"),
  header = c("Status", "Total", "Total_prc")
)[[1]]
my.tbl.3
## Status Total Total_prc
##1 Accepted 350699 33.0
##2 Synonym 470624 44.2
##3 Unplaced 243 0.0
##4 Unassessed 242469 22.8
# The columns now have the desired types
str(my.tbl.3)
##'data.frame': 4 obs. of 3 variables:
## $ Status : chr "Accepted" "Synonym" "Unplaced" "Unassessed"
## $ Total : int 350699 470624 243 242469
## $ Total_prc: num 33 44.2 0 22.8
# -----------------------
# Select multiple tables via their XPath selectors
# -----------------------
# Use a vector of XPath selectors
# Both selectors are handed over together as one character vector; the
# result is a list with one data frame per matched table (see output below).
tbls.lst.2 <- xpathApply(
  link.scrap,
  c('//*[@id="columns"]/section/div[1]/table',
    '//*[@id="columns"]/section/table[1]'),
  readHTMLTable,
  stringsAsFactors = FALSE
)
tbls.lst.2
##[[1]]
## V1 V2 V3 V4
##1 <U+25D5> Accepted 350,699 33.0%
##2 <U+25D5> Synonym 470,624 44.2%
##3 <U+25D5> Unplaced 243 0.0%
##4 <U+25D5> Unassessed 242,469 22.8%
##[[2]]
## V1 V2 V3 V4 V5 V6 V7 V8
##1 High confidence 149,349 229,242 0 0 378,591 35.6%
##2 Medium confidence 193,013 214,107 0 0 407,120 38.3%
##3 Low confidence 8,337 27,275 243 242,469 278,324 26.2%
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment