4 ## This takes about 3:20 minutes on my X200 w/Debian Jessie 64-bit.
7 ################################################################################
9 ################################################################################
11 HTML.FILES.DIR <- "/opt/www.contemporaryartdaily.com"
14 ################################################################################
15 ## Regexes and text processing
16 ################################################################################
18 # Some shows don't have artist, some have one, some have lists
19 # Some weird whitespace in some Artists: </em> blocks,
20 # and some artists don't end with </p>, so do it like this:
21 ARTISTS <- "<em>Artists?:(.+?)</p>"
23 VENUE <- "<em>Venue:(.+?)</p>"
24 # C</em>entral nervous system... strip tags and remove label instead?
25 EXHIBITION.TITLE <- "<em>Exhibition Title:(.+?)</p>"
26 DATE <- "<em>Date:(.+?)</p>"
28 PRESS.RELEASE <- "<em>Press Release:(.+)<!-- Start Sociable -->"
29 IS.PRESS.RELEASE <- "<em>Press Release:"
30 IMAGES <- "<div id='gallery-1'(.+?)<\\/div>"
31 IMAGE <- "src=\"http://www.contemporaryartdaily.com/([^\"]+)\""
32 PAGENUM <- "index.html\\?p=(.+)"
34 getImages <- function(text) {
35 gallery <- str_extract(text, IMAGES)
36 matches <- str_match_all(gallery, IMAGE)
37 ## Failed match returns list of character(0)
38 if(! identical(matches[[1]], character(0))) {
41 ## Return empty vector for no matches
46 cleanHTML <- function(html) {
48 detagged <- str_trim(str_replace_all(html, TAG, ""))
49 ## Trim so we end up with an empty string if it's just space
50 cleaned <- str_trim(str_replace_all(detagged, "[\t\r\n ]", " "))
51 ## xpathApply doesn't like being called on just space or on an empty string
53 ## Convert all whitespace and linebreaks to spaces
54 ## This is two operations in one, always convert the character that
55 ## looks like _ even if we decide not to convert others.
56 ## Convert html entities to characters
57 xpathApply(htmlParse(cleaned, asText=TRUE, encoding="UTF-8"),
65 getSection <- function(text, exp) {
66 section <- str_match(text, exp)[,2]
67 if(! is.na(section)) {
74 getArtists <- function(text) {
75 section <- getSection(text, ARTISTS)
76 artist.list <- str_match_all(section, ARTIST)
77 ## Failed match returns list of character(0)
78 if(! identical(artist.list[[1]], character(0))) {
79 artists <- str_trim(artist.list[[1]][,2])
80 artists[artists != ""]
82 ## Return empty vector for no matches
87 getPageNum <- function(filepath) {
88 as.integer(str_match(filepath, PAGENUM)[,2])
91 getTitle <- function(text) {
92 getSection(text, EXHIBITION.TITLE)
95 getVenue <- function(text) {
96 getSection(text, VENUE)
99 getDate <- function(text) {
100 getSection(text, DATE)
103 getPressRelease <- function(text) {
104 getSection(text, PRESS.RELEASE)
107 isPressRelease <- function(text) {
108 str_detect(text, IS.PRESS.RELEASE)
112 ################################################################################
114 ################################################################################
116 slurp <- function(filepath) {
117 readChar(filepath, file.info(filepath)$size, useBytes=TRUE)
120 files <- list.files(path=HTML.FILES.DIR,
121 pattern="index.html\\?p=[0-9]+$",
128 press.releases <- NULL
130 ## Building with rbind is slow. Speed this up.
131 ## If we use list() rather than data.frame(), strings aren't quoted by write.csv
134 for(filepath in files) {
135 text <- slurp(filepath)
136 if (isPressRelease(text)) {
137 pagenum <- getPageNum(filepath)
139 press.release <- getPressRelease(text)
140 press.releases <- rbind(press.releases,
141 data.frame(page=pagenum,
142 press.release=press.release))
144 shows <- rbind(shows, data.frame(page=pagenum,
145 title=getTitle(text),
146 venue=getVenue(text),
148 show.artists <- getArtists(text)
149 if(length(show.artists) > 0) {
150 artists.rows <- data.frame(page=rep(pagenum, length(show.artists)),
152 artists <- rbind(artists, artists.rows)
155 show.images <- getImages(text)
156 if(length(show.images) > 0) {
157 images.rows <- data.frame(page=rep(pagenum, length(show.images)),
159 images <- rbind(images, images.rows)
165 ################################################################################
166 ## Post-processing and cross-referencing
167 ################################################################################
169 ## Get the start year for each show (or NA)
170 shows <- data.frame(shows, year = strtoi(str_match(shows$date, "[0-9]{4}")))
171 ## Fix errors in the data
172 shows$year[shows$year == 1013] <- 2013
173 shows$year[shows$year == 3013] <- 2013
175 ## Get the show start year for each artist's appearances
176 artists <- data.frame(artists,
177 year = shows[match(artists$page, shows$page),]$year)
179 ## Venues. Some breakeage, removing Kunst* helps but we should really used
180 ## named entity discovery for cities.
181 venue.locations <- sapply(str_split(as.character(shows$venue), ","),
182 function(x) {str_trim(tail(x, n = 1))})
183 venue.locations <- str_trim(str_replace(venue.locations, "Kunst[^ ]+", ""))
184 shows <- data.frame(shows, location = venue.locations)
186 image.location <- sapply(images$page,
188 shows[shows$page == x, "location"]
190 location.images <- data.frame(location = image.location, image = images$image)
191 location.images <- location.images[location.images$location != "",]
193 image.venue <- sapply(images$page,
195 shows[shows$page == x, "venue"]
197 venue.images <- data.frame(venue = image.venue, image = images$image)
198 venue.images <- venue.images[venue.images$venue != "",]
200 image.year <- sapply(images$page,
202 shows[shows$page == x, "year"]
204 year.images <- data.frame(year = image.year, image = images$image)
207 ################################################################################
209 ################################################################################
211 write.csv(shows, "csv/press-release-shows.csv", row.names=FALSE)
212 write.csv(artists, "csv/press-release-artists.csv", row.names=FALSE)
213 write.csv(images, "csv/press-release-images.csv", row.names=FALSE)
214 write.csv(press.releases, "csv/press-release-texts.csv", row.names=FALSE)
215 write.csv(location.images, "csv/location-images.csv", row.names=FALSE)
216 write.csv(venue.images, "csv/venue-images.csv", row.names=FALSE)
217 write.csv(year.images, "csv/year-images.csv", row.names=FALSE)