```{r setup, include = FALSE}
library(tm)
library(topicmodels)
library(wordcloud)
library(stringr)
library(igraph)
library(knitr)         ## kable()
library(RColorBrewer)  ## brewer.pal()

library(SnowballC)
##library(proxy)
##library(Rgraphviz)

opts_chunk$set(fig.align = 'center') ## cache = TRUE,
```

```{r preparation, include = FALSE}
################################################################################
# Load and generate data
################################################################################

shows <- read.csv("csv/press-release-shows.csv", encoding = "UTF-8")
artists <- read.csv("csv/press-release-artists.csv", encoding = "UTF-8")
images <- read.csv("csv/press-release-images.csv", encoding = "UTF-8")
texts <- read.csv("csv/press-release-texts.csv", encoding = "UTF-8")

################################################################################
## Create corpus and matrices
################################################################################

texts.corpus <- Corpus(VectorSource(texts$press.release))
texts.dict <- texts.corpus

tf.control <- list(tolower = TRUE,
                   removePunctuation = TRUE,
                   removeNumbers = TRUE,
                   stopwords = stopwords("english"),
                   ##stemming = TRUE,
                   wordLengths = c(3, Inf),
                   weighting = weightTf)

tfidf.control <- list(tolower = TRUE,
                      removePunctuation = TRUE,
                      removeNumbers = TRUE,
                      stopwords = stopwords("english"),
                      ##stemming = TRUE,
                      wordLengths = c(3, Inf),
                      weighting = weightTfIdf)

texts.tdm <- TermDocumentMatrix(texts.corpus, control = tf.control)
texts.tdm <- removeSparseTerms(texts.tdm, 0.9)

texts.dtm <- DocumentTermMatrix(texts.corpus, control = tf.control)
texts.dtm <- removeSparseTerms(texts.dtm, 0.9)

################################################################################
## Utility code
################################################################################

## Inspect without printing

inspectSilently <- function(source) {
    as.matrix(source)
}

## Summarize a single column as a table

summarize <- function(column, amount, a, b) {
    the.summary <- summary(column)[1:amount]
    the.summary <- the.summary[names(the.summary) != ""]
    most.frequent <- data.frame(names(the.summary),
                                the.summary,
                                row.names = NULL)
    names(most.frequent) <- c(a, b)
    most.frequent
}
## Inline HTML-formatted frequencies from TDM columns (i.e. documents)

tdmFrequenciesForIndex <- function(index, tdm, min.count = 0) {
    freqs <- inspectSilently(tdm[, index])
    gt.min <- 1:length(freqs[freqs > min.count])
    decreasing.order <- order(freqs, decreasing = TRUE)[gt.min]
    names.ordered <- rownames(freqs)[decreasing.order]
    freqs.ordered <- freqs[decreasing.order]
    descs <- paste(names.ordered, " (", freqs.ordered, ")", sep = "",
                   collapse = ", ")
    paste("<b>", colnames(tdm)[index], ":</b>", descs, collapse = "")
}

## Inline a word cloud for each column/document

tdmWordClouds <- function(tdm, min.freq = 25) {
    sapply(1:length(colnames(tdm)),
           function(x) {
               cat("<h3>", colnames(tdm)[x], "</h3>\n")
               wordcloud(rownames(tdm),
                         inspectSilently(tdm[, x]), min.freq = min.freq)
           }) -> .null
}

## Dendrogram-free heatmap

heatMap <- function(freq) {
    norm.freq <- freq * (1.0 / max(freq))
    par(mai = c(2.75, 1.5, 0.1, 0.42))
    image(t(norm.freq)[, dim(norm.freq)[1]:1], xaxt = 'n', yaxt = 'n',
          bty = 'n', col = brewer.pal(9, "Blues"))
    axis(1, at = seq(0, 1, length.out = dim(norm.freq)[2]), lty = 0,
         labels = colnames(norm.freq), las = 2)
    axis(2, at = seq(0, 1, length.out = dim(norm.freq)[1]), lty = 0,
         labels = rev(rownames(norm.freq)), las = 2)
}

## Inline table of popularity by year

mostPopularByYear <- function(source, column) {
    year.counts <- table(source[c(column, "year")])
    year.counts <- year.counts[rownames(year.counts) != "", ]
    highest.counts <- year.counts[rowSums(year.counts) >= 7, ]
    kable(highest.counts[order(rowSums(highest.counts), decreasing = TRUE), ])
}

## Group the DTM into the given number of clusters

clusterMatrix <- function(dtm, dtm.names, cluster.count) {
    clusters <- kmeans(dtm, cluster.count)
    clusters.names <- lapply(1:cluster.count,
                             function(cluster) {
                                 dtm.names[clusters$cluster == cluster]})
    paste(lapply(1:cluster.count,
                 function(cluster) {
                     paste("Cluster", cluster, ":",
                           paste(unlist(clusters.names[cluster]),
                                 collapse = ", "))
                 }),
          collapse = ".<br />\n")
}

## Plot the graph in a nice style

plot.graph <- function (g, colour = "deepskyblue") {
    ## We scale various properties by degree, so we get degree and max for this
    degrees <- degree(g)
    max.degree <- max(degrees)
    ## 15 is the default - http://igraph.sourceforge.net/doc/R/plot.common.html
    vertex.sizes <- (0.3 + (degrees * (0.7 / max.degree))) * 30

    ##par(bg = "white")
    ##par(mai = c(0.25, 0.25, 0.25, 0.25))
    plot(g,
         ##edge.width = 0.01,
         ## This refuses to work as an edge property
         ## lightgray was too pale for single lines on one graph
         edge.color = colour,
         edge.arrow.size = 0.0,
         ##edge.curved = TRUE,
         vertex.size = vertex.sizes,
         vertex.frame.color = NA,
         vertex.color = colour,
         ##vertex.label.cex = vertex.sizes * 0.025,
         vertex.label.family = "sans",
         vertex.label.font = 2, ## bold
         vertex.label.color = "black")
}

## Convert the relationship table to a graph suitable for plotting

toGraph <- function (relationships) {
    ## Create a graph from the table
    g <- graph.edgelist(as.matrix(relationships), directed = FALSE)
    ## Simplify the graph to remove self-loops
    simplify(g, remove.multiple = FALSE, remove.loops = TRUE)
}

## Remove small unconnected graphs / islands

removeIslands <- function (g) {
    cl <- clusters(g)
    induced.subgraph(g, cl$membership == 1)
}

## Filter out nodes by degree

filterIslands <- function (g, min.degree = 3) {
    delete.vertices(g, which(degree(g) < min.degree))
}

## Paste selected texts

pasteShowTexts <- function(index, match.field) {
    pages <- shows$page[shows[, match.field] == index]
    page.texts <- texts$press.release[texts$page %in% pages]
    paste(page.texts, collapse = "\n")
}

## Corpus to de-sparsed tf-idf

toTfIdf <- function(corpus) {
    tfidf <- TermDocumentMatrix(corpus, control = tfidf.control)
    removeSparseTerms(tfidf, 0.20)
}

## Return a named list of details about shows
## selected by the frequency of a particular field
## pr == press releases

showDetails <- function(entries, pr, min.count = 10) {
    pr.corpus <- Corpus(VectorSource(pr))

    pr.tfidf <- toTfIdf(pr.corpus)

    pr.freq <- inspectSilently(pr.tfidf)
    colnames(pr.freq) <- entries

    pr.tdm <- TermDocumentMatrix(pr.corpus, control = tf.control)
    pr.tdm <- removeSparseTerms(pr.tdm, 0.1)
    colnames(pr.tdm) <- entries

    pr.dtm <- DocumentTermMatrix(pr.corpus, control = tf.control)
    pr.dtm <- removeSparseTerms(pr.dtm, 0.1)

    ## Format up most frequent words from the press releases
    pr.descs <- sapply(1:length(entries),
                       tdmFrequenciesForIndex,
                       tdm = pr.tdm,
                       min.count = min.count)

    list(names = entries, texts = pr, tfidf = pr.tfidf,
         tdm = pr.tdm, dtm = pr.dtm, freqs = pr.freq,
         descs = pr.descs)
}

## Get the details for the most popular shows by field

popularShowDetails <- function(field, min.count = 10) {
    all.entries <- summary(shows[, field])
    all.entries <- all.entries[names(all.entries) != ""]
    entries <- names(all.entries)[1:20]
    pr <- sapply(entries, function(x) {pasteShowTexts(x, field)})
    showDetails(entries, pr, min.count)
}

## Get the details for shows by year

yearShowDetails <- function(min.count = 10) {
    all.years <- unique(shows$year)
    all.years <- sort(all.years[! is.na(all.years)])
    years.pr <- sapply(all.years, function(year) {pasteShowTexts(year, "year")})
    showDetails(all.years, years.pr, min.count)
}
## Make a matrix of artist/entity associations,
## e.g. artist/location. TRUE = artist has been in a show there. FALSE = hasn't.
## If there are two shows called (e.g.) "Untitled", this gives bogus results.

artistShowMatrix <- function(artists.to.use, field) {
    ## These will be locations, shows, or venues
    column.names <- unique(shows[, field])
    column.names <- column.names[column.names != '']
    occurrences <- matrix(FALSE,
                          nrow = length(artists.to.use),
                          ncol = length(column.names),
                          dimnames = list(artists.to.use, column.names))
    ## Nested for loop in R. Not sure how to vectorize
    for (artist in as.character(artists.to.use)) {
        artist.pages <- artists[artists$artist == artist, ]$page
        for (column in column.names) {
            ## Get the page numbers for the shows matching this column value
            pages <- shows[shows[, field] == column, ]$page
            ## See if the artist is in a show with that page number
            occurrences[artist, column] <- any(! is.na(match(pages,
                                                             artist.pages)))
        }
    }
    occurrences
}

```


Contemporary Art Daily
======================

```{r press_releases_word_cloud, echo = FALSE}
texts.matrix <- as.matrix(texts.tdm)
texts.matrix.sorted <- sort(rowSums(texts.matrix), decreasing = TRUE)
texts.names <- names(texts.matrix.sorted)
texts.word.freqs <- data.frame(word = texts.names, freq = texts.matrix.sorted)

wordcloud(texts.word.freqs$word, texts.word.freqs$freq, min.freq = 250)
```

Contemporary Art Daily (CAD) is a leading contemporary art blog that publishes documentation for selected shows of contemporary art. It was started in 2008 by the then art student Forrest Nash, who describes the site as follows:

> Contemporary Art Daily is a website that publishes documentation of at least one contemporary art exhibition every day. We have an international purview, and we work hard to get especially high-quality documentation of the shows we publish.

Since `r min(shows$year, na.rm = TRUE)` CAD has published the details of more than `r floor(length(shows$page) / 100) * 100` shows, including descriptive text, images of works included, and lists of artists involved in each show.

Nash describes the criteria used for selecting that documentation as follows:

> Our criteria for Contemporary Art Daily is complicated and not perfectly reducible, but I like to say that we are generally trying to balance two motives that sometimes conflict with each other. On the one hand, we do have a kind of journalistic motive: we hope to in some way represent the breadth of what is happening in contemporary art, even when a particular artist is not of personal interest to us. On the other hand, we have a curatorial motive, to advance art we believe in and think is important. I am usually more concerned about making a mistake and failing to see or include something than I am accidentally letting something through the filter that doesn’t belong.

(from: http://metropolism.com/features/why-contemporary-art-daily/)

As a curated resource, CAD is not a statistically representative sample of all contemporary art shows. Like a museum collection, a survey show or a textbook, it is a mediated, value-laden view of the artworld. Its popularity demonstrates the appeal of this particular view to contemporary artworld audiences. Analyzing CAD is therefore a way of gaining insight into one popular view of the contemporary artworld.

The HTML of www.contemporaryartdaily.com was downloaded in January 2014 and processed with an R script to extract the text and details of each post on the site that announces a show in the site's standard format. That data is then loaded by the R code in this file to generate the report you are now reading. For reasons of practicality and clarity, some analysis has been performed on the entire dataset, and some on just the most popular entities (i.e. the most frequently occurring values) within it.
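
The extraction script itself is not included in this file. A minimal sketch of the approach using the XML package is below; the `post` CSS class, the file path, and the assumption that a press release is the post's paragraphs are all illustrative, not the site's actual markup:

```{r scrape_sketch, eval = FALSE}
## Sketch only, not evaluated: parse a saved page and pull out the
## press release text of each post. The real script must match the
## site's actual HTML structure.
library(XML)

page <- htmlParse("saved-html/page-1.html", encoding = "UTF-8")
posts <- getNodeSet(page, "//div[@class = 'post']")

press.releases <- sapply(posts, function(post) {
    ## Concatenate the post's paragraphs into one press release text
    paste(xpathSApply(post, ".//p", xmlValue), collapse = "\n")
})
```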

The presence or absence of surprises in the data may indicate fidelity or bias in the worldview of Contemporary Art Daily or of the online contemporary artworld audience in relation to each other. The extent to which this generalizes to the culture or the reality of the wider contemporary artworld is open to question. Comparing CAD to a more general art show listings website would provide evidence either way, but is outside the scope of the current study. The reader's intuition will have to suffice on these matters for now.


Texts
=====

Word Frequency
--------------
*Words that occur 500 or more times in the corpus:* `r findFreqTerms(texts.tdm, lowfreq = 500)`.

*Words that occur 1000 or more times in the corpus:* `r findFreqTerms(texts.tdm, lowfreq = 1000)`.

*Words that occur 2000 or more times in the corpus:* `r findFreqTerms(texts.tdm, lowfreq = 2000)`.

Word Associations For Most Frequent Terms
-----------------------------------------
```{r press_releases_frequent_terms_associations, echo = FALSE, results = "asis"}
describeAssocs <- function(assocList, term) {
    assocs <- paste(names(assocList[[term]]), collapse = ", ")
    ## <b> to make the term bold in html, * for markdown isn't parsed under asis
    paste(c("<b>", term, ":", "</b> ", assocs), collapse = "")
}

assocs <- findAssocs(texts.dtm, findFreqTerms(texts.tdm, lowfreq = 2000), 0.25)
descs <- sapply(names(assocs),
                function(name) { describeAssocs(assocs, name) })

cat(descs, sep = "\n\n")
```

```{r press_releases_clustering, include = FALSE, cache = TRUE}
## Text Clustering
## Not currently working
## 150 was the number found automatically
## clusterCount <- 10
## clusters <- kmeans(texts.dtm, clusterCount)
## clusters.texts <- lapply(1:clusterCount,
##                          function(cluster) {
##                              texts$artist[clusters$cluster == cluster]})
## ##clusters.texts
## for (cluster in 1:clusterCount) {
##     cat("Cluster", cluster, ":",
##         paste(unlist(clusters.texts[cluster]), collapse = ", "), "\n\n")
## }
```


Text Topic Modelling
--------------------
```{r press_release_topic_modelling, echo = FALSE, results = "asis", cache = TRUE}
lda.control <- list(burnin = 1000,
                    iter = 1000,
                    keep = 50)
k <- 30
lda <- LDA(texts.dtm, k, method = "Gibbs", control = lda.control)
topic.terms <- terms(lda, 20)

topic.descs <- apply(topic.terms, 2, paste, collapse = ", ")
for (i in 1:length(topic.descs)) {
    cat("<b>Topic ", i, ":</b> ", topic.descs[i], ".\n\n", sep = "")
}
```
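
The `keep = 50` control makes the Gibbs sampler record its log-likelihood as it runs, which gives a quick convergence check. A sketch, not evaluated here:

```{r lda_convergence_sketch, eval = FALSE}
## The chain's log-likelihood trace, one value every `keep` iterations,
## is stored in the fitted model's logLiks slot. The trace should level
## off once the sampler has converged.
log.liks <- lda@logLiks
plot(seq_along(log.liks) * lda.control$keep, log.liks, type = "l",
     xlab = "Gibbs iteration", ylab = "log-likelihood")
```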

Text Vocabulary Over Time
=========================

```{r press_releases_over_time, echo = FALSE}
years.details <- yearShowDetails(50)
```

Year Vocabulary Word Frequencies
--------------------------------
`r paste(years.details$descs, collapse = ".\n\n")`

Year Vocabulary tf-idf Heatmap
------------------------------
```{r year_matrix_tfidf, echo = FALSE, results = "asis", fig.height = 24}
yearsfreq <- years.details$freqs[rowSums(years.details$freqs) > 0.00015, ]
heatMap(yearsfreq)
```

Year Vocabulary Wordclouds
--------------------------
```{r year_wordclouds, echo = FALSE, results = "asis"}
tdmWordClouds(years.details$tdm, 100)
```


Artists
=======

Artist Popularity
-----------------
```{r most_popular_artists, echo = FALSE, results = "asis"}
most.popular.artists <- summarize(artists$artist, 48, "Artist", "Shows")
kable(most.popular.artists)
```

Artist Popularity By Year
-------------------------
```{r most_popular_artists_years, echo = FALSE, results = "asis"}
mostPopularByYear(artists, "artist")
```

Artist Clustering
-----------------
```{r artist_cluster_shows, echo = FALSE, results = "asis", cache = TRUE}
artist.shows <- artistShowMatrix(most.popular.artists$Artist, 'title')
artist.venues <- artistShowMatrix(most.popular.artists$Artist, 'venue')
artist.locations <- artistShowMatrix(most.popular.artists$Artist, 'location')
```
We can't cluster artists by press release texts, as a text may not refer to each artist uniquely.

So we cluster artists by show, venue, and city appearances instead.
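
To illustrate what `clusterMatrix` works on here, a toy occurrence matrix (the artists and shows are invented), not evaluated:

```{r artist_cluster_sketch, eval = FALSE}
## Rows are artists, columns are shows; 1 if the artist appeared in
## the show, 0 if not. Artists with similar rows should land in the
## same k-means cluster: here A and B share Show 1, C stands apart.
toy <- matrix(c(1, 1, 0,
                1, 0, 0,
                0, 1, 1),
              nrow = 3, byrow = TRUE,
              dimnames = list(c("Artist A", "Artist B", "Artist C"),
                              c("Show 1", "Show 2", "Show 3")))
clusterMatrix(toy, rownames(toy), 2)
```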

**Show:**

`r clusterMatrix(artist.shows, most.popular.artists$Artist, 8)`

**Venue:**

`r clusterMatrix(artist.venues, most.popular.artists$Artist, 8)`

**City:**

`r clusterMatrix(artist.locations, most.popular.artists$Artist, 8)`


Venues
======
```{r venue_matrices, echo = FALSE}
popular.venues <- popularShowDetails("venue")
```

Most Popular Venues
-------------------
```{r most_popular_venues, echo = FALSE, results = "asis"}
kable(summarize(shows$venue, 35, "Venue", "Shows"))
```

Most Popular Venues By Year
---------------------------
```{r most_popular_venue_years, echo = FALSE, results = "asis"}
mostPopularByYear(shows, "venue")
```

Venue Word Frequencies
----------------------
`r paste(popular.venues$descs, collapse = ".\n\n")`

Venue Clustering
----------------

Clustering the most popular venues:

`r clusterMatrix(popular.venues$dtm, popular.venues$names, 5)`


Venue Wordclouds
----------------
```{r venue_wordclouds, echo = FALSE, results = "asis"}
tdmWordClouds(popular.venues$tdm, 20)
```

Venue tf-idf Heatmap
--------------------
```{r venue_matrix_tfidf, echo = FALSE, results = "asis", fig.height = 24}
venuesfreq <- popular.venues$freqs[rowSums(popular.venues$freqs) > 0.002, ]
heatMap(venuesfreq)
```

Cities
======

```{r location_matrices, echo = FALSE}
popular.locations <- popularShowDetails("location")
```

Most Popular Cities
-------------------
City determination is naive; we should clean this up somehow. One possible approach is sketched below.
```{r most_popular_venue_locations, echo = FALSE, results = "asis"}
kable(summarize(shows$location, 35, "Location", "Shows"))
```
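
One possible cleanup, sketched but not evaluated: trim stray whitespace and fold known aliases onto a canonical city name. The alias table here is illustrative, not derived from the data:

```{r location_cleanup_sketch, eval = FALSE}
## Trim whitespace, then map known aliases onto canonical names.
aliases <- c("NYC" = "New York", "New York City" = "New York")

cleaned <- str_trim(as.character(shows$location))
known <- cleaned %in% names(aliases)
cleaned[known] <- aliases[cleaned[known]]
```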

Most Popular Cities By Year
---------------------------
```{r most_popular_city_years, echo = FALSE, results = "asis"}
mostPopularByYear(shows, "location")
```

City Word Frequencies
---------------------
`r paste(popular.locations$descs, collapse = ".\n\n")`

City Clustering
---------------
Clustering the most popular cities:

`r clusterMatrix(popular.locations$dtm, popular.locations$names, 5)`

City tf-idf Heatmap
-------------------
```{r location_matrix_tfidf, echo = FALSE, results = "asis", fig.height = 14}
locfreqs <- popular.locations$freqs[rowSums(popular.locations$freqs) > 0.002, ]
heatMap(locfreqs)
```

City Wordclouds
---------------
```{r city_wordclouds, echo = FALSE, results = "asis"}
tdmWordClouds(popular.locations$tdm, 20)
```


Graphs of show/gallery/artist links
===================================

```{r show_links_graphs, echo = FALSE}
popular.venues.shows <- shows[shows$venue %in% popular.venues$names, ]
popular.venues.shows.artists <-
    artists[artists$page %in% popular.venues.shows$page, ]
popular.venues.shows.locations <-
    shows$location[match(popular.venues.shows.artists$page, shows$page)]

popular.venues.shows.locations.artists <-
    data.frame(artist = popular.venues.shows.artists$artist,
               location = popular.venues.shows.locations)

popular.venues.shows.artists.unique <- unique(popular.venues.shows.artists$artist)

## Generate a matrix of pairs of locations (cities)
## representing each pair of cities an artist has exhibited in.
## This is the reflexive cartesian product of the list of cities
## for the artist.
## It will therefore include city1:city1, probably many times,
## and will give city1:city2 many times even if the artist has only
## shown in city2 once.
## Replace with a simple pair walk *or* weight better
## (see the sketch after this chunk).

popular.locations.artists.links <- do.call(rbind,
    lapply(popular.venues.shows.artists.unique,
           function(artist) {
               locations <- popular.venues.shows.locations.artists$location[
                   popular.venues.shows.locations.artists$artist == artist]
               expand.grid(locations, locations)
           }))

## Generate a matrix of pairs of shows each artist has exhibited in

popular.shows.artists.links <- do.call(rbind,
    lapply(popular.venues.shows.artists.unique,
           function(artist) {
               pages <- popular.venues.shows.artists$page[
                   popular.venues.shows.artists$artist == artist]
               shows <- popular.venues.shows$title[popular.venues.shows$page %in% pages]
               ## Some shows have no title (or none was extracted);
               ## remove them, as otherwise this breaks the graph.
               ## Also convert to the native (Windows) encoding to avoid
               ## knitr breakage.
               shows <- iconv(shows[shows != ''], 'utf-8', '')
               expand.grid(shows, shows)
           }))

## Generate a matrix of pairs of venues each artist has exhibited in

popular.venues.artists.links <- do.call(rbind,
    lapply(popular.venues.shows.artists.unique,
           function(artist) {
               pages <- popular.venues.shows.artists$page[
                   popular.venues.shows.artists$artist == artist]
               venues <- popular.venues.shows$venue[popular.venues.shows$page %in% pages]
               expand.grid(venues, venues)
           }))

popular.locations.graph <- toGraph(popular.locations.artists.links)
popular.venues.graph <- toGraph(popular.venues.artists.links)
popular.shows.graph <- toGraph(popular.shows.artists.links)
```
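
The "simple pair walk" suggested above could use `combn` to emit each unordered pair of distinct cities once per artist. A sketch, not used in this report; note it discards the repeat counts that currently weight the graph's edges:

```{r pair_walk_sketch, eval = FALSE}
## Each unordered pair of distinct cities an artist has shown in,
## generated once per artist instead of via the reflexive product.
pairsForArtist <- function(locations) {
    locations <- unique(as.character(locations))
    if (length(locations) < 2) return(NULL)
    t(combn(locations, 2))
}
## Used as above: do.call(rbind, lapply(artists, ...)) with
## pairsForArtist replacing expand.grid.
```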

Locations
---------
```{r locations_graph, echo = FALSE}
plot.graph(removeIslands(popular.locations.graph), colour = "plum")
```

Venues
------
```{r venues_graph, echo = FALSE, fig.width = 10, fig.height = 10}
plot.graph(removeIslands(popular.venues.graph), colour = "darkgoldenrod1")
```

Shows
-----
```{r shows_graph, echo = FALSE, fig.width = 12, fig.height = 12}
plot.graph(filterIslands(popular.shows.graph, 10), colour = "darkturquoise")
```