-
Notifications
You must be signed in to change notification settings - Fork 1
/
reap-table.R
207 lines (177 loc) · 6.21 KB
/
reap-table.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
#' Extract data from HTML tables
#'
#' This behaves differently than [rvest::html_table()]. It does an
#' aggressive fill by default when `colspan`s or `rowspan`s are detected
#' and does not make any attempt to go beyond providing a basic data frame
#' of the HTML table. See `Details` for more information.
#'
#' The functionality provided in [rvest::html_table()] is double-plus good so
#' the intent of this function was not to subvert it. Rather, [reap_table()]
#' was designed to give you more direct R-access to the underlying structure
#' of an HTML table so you can wrangle it as you please. In "raw" mode,
#' you get a list with attributes enabling you to work with the table structure,
#' cell values and entity attributes with R idioms vs XPath queries.
#'
#' @note When passing in a `reapr_doc` object, the pre-parsed HTML will be
#' tested for validity and re-generated if the external pointer is
#' invalid.
#' @param x a `reapr_doc` or anyting you're used to passing to [rvest::html_table()]
#' @param raw if `TRUE` then a `list` with rows and cells will be returned. Each
#' cell has the value in the source HTML table but also has an `hattr`
#' attribute (short for "html entity attribute") which contains all the
#' attributes (if any) of the table cell. Each row in the list also has an `hattr`
#' attribute holding its attributes (if any). This structure may be useful
#' for doing more infolved extractions of weirdly formed HTML tables
#' without having to muck with XPath queries. Default: `FALSE`
#' @param trim if `TRUE` trim cell whitespace. Default: `FALSE`.
#' @export
#' @examples
#' x <- reap_url("https://en.wikipedia.org/wiki/Demography_of_the_United_Kingdom")
#'
#' # take advantage of the pre-processing reap_url() does:
#' tbl <- reap_table(x$tag$table[[10]])
#' tbl_raw <- reap_table(x$tag$table[[10]], raw=TRUE)
#'
#' # get all of 'em:
#' tbls <- reap_table(x)
#'
#' # fid a specific one:
#' reap_node(x, ".//table[contains(., 'Other identity and at least one UK identity')]") %>%
#' reap_table() -> tbl
reap_table <- function(x, raw=FALSE, trim=TRUE) UseMethod("reap_table")
#' @export
reap_table.reapr_doc <- function(x, raw=FALSE, trim=TRUE) {
x <- validate_parsed_content(x)
reap_table(x$parsed_html, raw, trim)
}
#' @export
reap_table.xml_document <- function(x, raw=FALSE, trim=TRUE) {
tbls <- xml2::xml_find_all(x, ".//table")
lapply(tbls, reap_table, raw, trim)
}
#' @export
reap_table.xml_nodeset <- function(x, raw=FALSE, trim=TRUE) {
lapply(x, reap_table, raw, trim)
}
#' @export
reap_table.xml_node <- function(x, raw=FALSE, trim=TRUE) {
stopifnot(xml2::xml_name(x) == "table")
trs <- xml_find_all(x, ".//tr")
lapply(trs, function(.x) {
xml_find_all(.x, ".//td|.//th") %>%
lapply(function(.x) {
val <- xml_text(.x, trim=trim)
attr(val, "hattr") <- xml_attrs(.x)
class(val) <- c("reapr_tbl_cell", "list")
val
}) -> row
attr(row, "hattr") <- xml_attrs(.x)
class(row) <- c("reapr_tbl_row", "list")
row
}) -> tblist
attr(tblist, "hattr") <- xml_attrs(x)
class(tblist) <- c("reapr_raw_tbl", "list")
if (raw) return(tblist)
row_count <- length(tblist)
col_count <- max(sapply(tblist, length))
mtbl <- matrix(data = NA_character_, nrow=row_count, ncol=col_count)
for (ridx in seq_along(tblist)) {
row <- tblist[[ridx]]
cofs <- 0
for (cidx in seq_along(row)) {
col <- row[[cidx]] # actual value @ index in what was in the HTML
if (trim) col <- trimws(col)
cattrs <- attr(col, "hattr")
cspan <- as.integer(cattrs["colspan"] %na% 1) - 1 # doing a range later so 1=0, 2=1 for range addition
rspan <- as.integer(cattrs["rowspan"] %na% 1) - 1
# move over until not NA (implies a rowspan somewhere above current row)
repeat {
if ((cidx + cofs) > col_count) {
cofs <- cofs - 1
break
}
if (is.na(mtbl[ridx, cidx+cofs])) break # current position has NA so we can stop
cofs <- cofs + 1 # move over one
}
# cat("VAL: ", trimws(col), "\n", sep="")
# cat(" RC: ", row_count, "; ", ridx, ":", ridx+rspan, "\n", sep="")
# cat(" CC: ", length(row), "; ", (cidx+cofs), ":", (cidx+cofs+cspan), "\n\n", sep="")
if ((cofs+cspan) > length(row)) break
if ((cidx+cofs+cspan) > col_count) cspan <- 0
mtbl[ridx:(ridx+rspan), (cidx+cofs):(cidx+cofs+cspan)] <- col
cofs <- cofs + cspan
}
}
xdf <- as.data.frame(mtbl, stringsAsFactors = FALSE)
class(xdf) <- c("tbl_df", "tbl", "data.frame")
xdf
}
elip <- function(x, n=10) {
sapply(
x,
function(.x) if (nchar(.x) > n) sprintf("%s...", substr(.x, 1, n-1)) else .x,
USE.NAMES = FALSE
)
}
#' Print for reapr table elements
#'
#' @param x reapr raw table
#' @param ... ignored
#' @param indent how much to indent this element
#' @keywords internal
#' @export
print.reapr_raw_tbl <- function(x, ..., indent = 0) {
h <- attr(x, "hattr")
if (length(h) == 0) {
cat("<table (noattrs)>\n")
} else {
cat(
paste0(rep(" ", indent), collapse=""),
"<table ",
paste0(sprintf("%s=%s", names(h), shQuote(elip(h))), collapse = " "),
">\n",
sep=""
)
}
for (row in seq_along(x)) {
print(x[[row]], indent = indent + 2)
}
}
#' @rdname print.reapr_raw_tbl
#' @keywords internal
#' @export
print.reapr_tbl_row <- function(x, ..., indent = 0) {
h <- attr(x, "hattr")
if (length(h) == 0) {
cat(paste0(rep(" ", indent), collapse=""), "<row (noattrs)>\n", sep="")
} else {
cat(
paste0(rep(" ", indent), collapse=""),
"<row ",
paste0(sprintf("%s=%s", names(h), shQuote(elip(h))), collapse = " "),
">\n",
sep=""
)
}
for (cell in seq_along(x)) {
print(x[[cell]], indent = indent + 2)
}
}
#' @rdname print.reapr_raw_tbl
#' @keywords internal
#' @export
print.reapr_tbl_cell <- function(x, ..., indent = 0) {
h <- attr(x, "hattr")
if (length(h) == 0) {
cat(paste0(rep(" ", indent), collapse=""), "<cell (noattrs)>\n", sep="")
} else {
h <- as.list(h)
cat(
paste0(rep(" ", indent), collapse=""),
"<cell ",
paste0(sprintf("%s=%s", names(h), shQuote(elip(h))), collapse = " "),
">\n",
sep=""
)
}
}