About
Sys.setenv(LANG = "en")
#library("rstudioapi") #to grab local position of the script
#setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
knitr::opts_knit$set(root.dir = '.')
#library("rvest") # to handle html stuff
library(lubridate) # to handle dates
library(ggplot2) # for plotting
library(cowplot) # for plotting
library(RColorBrewer) # for choosing colors
custompalette <- brewer.pal(n=8, name = 'Dark2')
library(knitr) # for tables
library(kableExtra) # for tables
library(lubridate) # for dates
library(plyr) # ddply, to summarize number of words by author
load('BSails_worksData.RData')
This is a document detailing analysis of Black Sails Ao3 tag data, collected on the 11 Aug 2020. I haven’t figured out a way to get my scrapper to log in into Ao3 (yet? rvest seems to have some trouble with page redirects), so results here are based on the works visible without authentication, which likely filters out preferentially explicit/problemantic works from the selection.
plot_bar <- function (data, columnX, legendPosition) {
ggplot(data, aes_string(x = columnX)) +
geom_bar(alpha=1)+
theme_half_open() +
background_grid() +
theme(legend.title=element_blank(),
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
labs(y="Number of works")
}
plot_bar_color <- function (data, columnX, colColor, legendPosition) {
ggplot(data, aes_string(x = columnX, fill=colColor)) +
geom_bar(alpha=0.7)+
scale_fill_manual(values = custompalette) +
theme_half_open() +
background_grid() +
theme(legend.title=element_blank(),
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
labs(y="Number of works")
}
plot_col <- function (data, columnX, columnY, legendPosition) {
ggplot(data, aes_string(x = columnX, y = columnY)) +
geom_col(alpha=1)+
theme_half_open() +
background_grid() +
theme(legend.title=element_blank(),
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
labs(y=gsub('\\.', ' ', columnY))
}
plot_col_color <- function (data, columnX, columnY, colColor, legendPosition) {
ggplot(data, aes_string(x = columnX, y = columnY, fill=colColor)) +
geom_col(alpha=0.7)+
scale_fill_manual(values = custompalette) +
theme_half_open() +
background_grid() +
theme(legend.title=element_blank(),
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
labs(y=gsub('\\.', ' ', columnY))
}
plot_percentiles <- function (data, columnX, columnY, legendPosition) {
ggplot(data, aes_string(x = columnX, y = columnY)) +
geom_point(alpha=0.3)+
scale_y_log10(breaks = 10^c(0:15))+
scale_x_continuous(breaks = c(0, 25, 50, 75, 100))+ #scale_x_continuous(breaks = c(0:10)*10)+
theme_half_open() +
background_grid() +
theme(legend.title=element_blank())+
labs(x=gsub('\\.', ' ', columnX))
}
#title <- lapply(worksData, function(x) {x$Title})
author <- lapply(worksData, function(x) {x$Author})
fandom <- lapply(worksData, function(x) {x$Fandom})
rating <- lapply(worksData, function(x) {x$Rating})
warnings <- lapply(worksData, function(x) {x$Warnings})
category <- lapply(worksData, function(x) {x$Category})
WIP <- lapply(worksData, function(x) {x$WIP})
date <-lapply(worksData, function(x) {x$Date})
relationships <-lapply(worksData, function(x) {x$Relationships})
character <-lapply(worksData, function(x) {x$Character})
freeform <-lapply(worksData, function(x) {x$Freeform})
language <-lapply(worksData, function(x) {x$Language})
words <-lapply(worksData, function(x) {x$Words})
words[is.na(words)] <- 0
kudos <-lapply(worksData, function(x) {x$Kudos})
kudos[is.na(kudos)] <- 0
comments <-lapply(worksData, function(x) {x$Comments})
comments[is.na(comments)] <- 0
bookmarks<-lapply(worksData, function(x) {x$Bookmarks})
bookmarks[is.na(bookmarks)] <- 0
hits <-lapply(worksData, function(x) {x$Hits})
hits[is.na(hits)] <- 0
stats <- data.frame(Words = unlist(words, recursive = FALSE),
Comments= as.numeric(as.character(comments)),
Kudos = as.numeric(as.character(kudos)),
Bookmarks = as.numeric(as.character(bookmarks)),
Hits = as.numeric(as.character(hits)),
WIP = unlist(WIP, recursive = FALSE),
Rating = unlist(rating, recursive = FALSE),
Date = do.call("c", date))
stats$Rating <- factor(stats$Rating, levels = c("Not Rated", "General Audiences", "Teen And Up Audiences", "Mature", "Explicit"))
total <- 1000
percentile <- c(1:total)
percentileData <- data.frame(Works.Percentile = 100*(total - percentile)/total,
Words = unlist(lapply(percentile/total, quantile, x = unlist(words) )) + 1,
Hits = unlist(lapply(percentile/total, quantile, x = unlist(hits) )) + 1,
Kudos = unlist(lapply(percentile/total, quantile, x = unlist(kudos) )) + 1,
Comments = unlist(lapply(percentile/total, quantile, x = unlist(comments) )) + 1,
Bookmarks = unlist(lapply(percentile/total, quantile, x = unlist(bookmarks) )) + 1 )
rm(rating, kudos, comments, bookmarks, hits)
Timeline
Solid vertical lines on the graph indicate initial air dates, and dashed indicate final air dates, according to Wiki article.
We see a peak of activity after each season which builds up to a major peak after season 4. A minor bump in May 2020 could be attributed to coronavirus locdowns.
There are 3 works published before season 1 premier, one on 22 June 2013, which is probably an artifact of work import, the other two are dated 22 of January are tagged with “Anne Bonny/”Calico" Jack Rackham " and could be due to trailer hype and/or historical pirates fandom.
#data$Timestamp <- parse_date_time2(as.character(data$Timestamp), orders = "%d/%m/%Y %H:%M:%S")
#data$day <- as.Date(data$Timestamp)
seasonsStart <- c("2014-01-25", "2015-01-24", "2016-01-23", "2017-01-29")
seasonsStart <- as.Date(seasonsStart)
seasonsEnd <- c("2014-03-15", "2015-03-28", "2016-03-26", "2017-04-02")
seasonsEnd <- as.Date(seasonsEnd)
plotDatesDensityTotal <- ggplot(stats, aes(x = Date)) +
geom_density()+
geom_vline(xintercept=as.numeric(seasonsStart))+
geom_vline(xintercept=as.numeric(seasonsEnd), linetype ="longdash")+
scale_x_date(date_breaks="3 months")+
theme_half_open() +
background_grid() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
legend.position = 'right')
plotDatesDensityTotal

rm(plotDatesDensityTotal)
If we plot Complete Works and Works in Progress separately, we still observe an overall peak structure in complete works, but, interestingly, Works in Progress follow the basic structure but don’t fluctuate much with new seasons.
plotDatesDensity <- ggplot(stats, aes(x = Date, col=WIP)) +
geom_density(alpha = 0.1)+
geom_vline(xintercept=seasonsStart)+
geom_vline(xintercept=seasonsEnd, linetype ="longdash")+
scale_x_date(date_breaks="3 months")+
theme_half_open() +
background_grid() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
legend.position = 'right')
plotDatesDensity

rm(plotDatesDensity)
Engagement percentiles
Small plotting cheat: all the numbers on the Y axis are increased by 1 to include the case of 0 into the plot (otherwise excluded because of log scale).
- About 75% of works have more than a 1000 words, but only about 10% have more than 10000 words.
- Only about 25% of works have over a 1000 hits.
- Only about 25% of works have more than a 100 kudos.
- Only about 50% of works get more than 10 comments.
- Approximately 5% of works have no comments (tail end).
- Only approximately 35% of works get more than 10 bookmarks.
- Approximately 15% of works have no bookmarks (tail end).
wordsPercentiles <- plot_percentiles(percentileData, 'Works.Percentile', 'Words', 'right')
hitsPercentiles <- plot_percentiles(percentileData, 'Works.Percentile', 'Hits', 'right')
kudosPercentiles <- plot_percentiles(percentileData, 'Works.Percentile', 'Kudos', 'right')
commentsPercentiles <- plot_percentiles(percentileData, 'Works.Percentile', 'Comments', 'right')
bookmarksPercentiles <- plot_percentiles(percentileData, 'Works.Percentile', 'Bookmarks', 'right')
plot_grid(wordsPercentiles + theme(legend.position="none"),
hitsPercentiles + theme(legend.position="none"),
kudosPercentiles + theme(legend.position="none"),
commentsPercentiles + theme(legend.position="none"),
bookmarksPercentiles + theme(legend.position="none") )

rm(total, percentile, percentileData, wordsPercentiles, hitsPercentiles, kudosPercentiles, commentsPercentiles, bookmarksPercentiles)
Complete Work vs Work in Progress distributions
- Majority of works are Complete
- Works in Progress are more than 3 times longer than Complete ones.
- Both Complete Works and Works in Progress get approximately the same amounts of hits.
- Complete Works get about 20% more kudos.
- Works in Progress get approximately 2 times as many comments as Complete ones (however, again, there’s no way to filter out author’s comments in the search selection).
- Complete Works get slightly more bookmarks than Works in Progress.
statsWIP <- stats
statsWIP$Divisor <- unlist(lapply(statsWIP$WIP, function(x) summary(statsWIP$WIP)[names(summary(statsWIP$WIP)) == x]))
statsWIP$Words.per.Work <- statsWIP$Words/statsWIP$Divisor
statsWIP$Hits.per.Work <- statsWIP$Hits/statsWIP$Divisor
statsWIP$Kudos.per.Work <- statsWIP$Kudos/statsWIP$Divisor
statsWIP$Comments.per.Work <- statsWIP$Comments/statsWIP$Divisor
statsWIP$Bookmarks.per.Work <- statsWIP$Bookmarks/statsWIP$Divisor
barWorksWIP <- plot_bar(statsWIP, 'WIP', 'right')
barWordsWIP <- plot_col(statsWIP, 'WIP', 'Words.per.Work', 'right')
barHitsWIP <- plot_col(statsWIP, 'WIP', 'Hits.per.Work', 'right')
barKudosWIP <- plot_col(statsWIP, 'WIP', 'Kudos.per.Work', 'right')
barCommentsWIP <- plot_col(statsWIP, 'WIP', 'Comments.per.Work', 'right')
barBookmarksWIP <- plot_col(statsWIP, 'WIP', 'Bookmarks.per.Work', 'right')
# plot_grid(plot_grid( barWorksWIP + theme(legend.position="none"),
# barWordsWIP + theme(legend.position="none"),
# barHitsWIP + theme(legend.position="none"),
# barKudosWIP + theme(legend.position="none"),
# barCommentsWIP + theme(legend.position="none"),
# barBookmarksWIP + theme(legend.position="none"),
# align = 'hv'),
# get_legend(barWorksWIP + theme(legend.title=element_blank())),
# rel_widths = c(4,1),
# align = 'hv')
plot_grid( barWorksWIP + theme(legend.position="none"),
barWordsWIP + theme(legend.position="none"),
barHitsWIP + theme(legend.position="none"),
barKudosWIP + theme(legend.position="none"),
barCommentsWIP + theme(legend.position="none"),
barBookmarksWIP + theme(legend.position="none"),
align = 'hv')

rm(statsWIP, barWorksWIP, barWordsWIP, barHitsWIP, barKudosWIP, barCommentsWIP, barBookmarksWIP)
Rating distributions
- Most works are rated T and E, but G and M rated works are not far behind.
- Works rated G are on average the shortest (~1500 words), followed by T (~4000 words), Not rated (~9000 words), E (~ 9000 words), and M (~11000 words). The trend of M rated works being the longest we observed in other fandoms holds here as well.
- G rated works get fewest hits (~500), followed by Not Rated (~700). Otherwise, the number of hits rises with the rating. E rated works are most popular (~1600).
- Number of kudos, comments and bookmarks seem to be broadly proportional to the number of hits.
statsRating <- stats
statsRating$Divisor <- unlist(lapply(statsRating$Rating, function(x) summary(statsRating$Rating)[names(summary(statsRating$Rating)) == x]))
statsRating$Words.per.Work <- statsRating$Words/statsRating$Divisor
statsRating$Hits.per.Work <- statsRating$Hits/statsRating$Divisor
statsRating$Kudos.per.Work <- statsRating$Kudos/statsRating$Divisor
statsRating$Comments.per.Work <- statsRating$Comments/statsRating$Divisor
statsRating$Bookmarks.per.Work <- statsRating$Bookmarks/statsRating$Divisor
barWorksRating <- plot_bar(statsRating, 'Rating', 'right')
barWordsRating <- plot_col(statsRating, 'Rating', 'Words.per.Work', 'right')
barHitsRating <- plot_col(statsRating, 'Rating', 'Hits.per.Work', 'right')
barKudosRating <- plot_col(statsRating, 'Rating', 'Kudos.per.Work', 'right')
barCommentsRating <- plot_col(statsRating, 'Rating', 'Comments.per.Work', 'right')
barBookmarksRating <- plot_col(statsRating, 'Rating', 'Bookmarks.per.Work', 'right')
plot_grid( barWorksRating + theme(legend.position="none"),
barWordsRating + theme(legend.position="none"),
barHitsRating + theme(legend.position="none"),
barKudosRating + theme(legend.position="none"),
barCommentsRating + theme(legend.position="none"),
barBookmarksRating + theme(legend.position="none"),
align = 'hv')

rm(statsRating, barWorksRating, barWordsRating, barHitsRating, barKudosRating, barCommentsRating, barBookmarksRating)
Categories
There are 3467 works tagged with a single category, and 860 tagged with 2 or more (up until all 6).
‘M/M’ is the most popular category, and it dwarfs all the others.
Multiple category fics strongly contribute towards ‘M/M’ count, then to ‘F/M’, ‘Gen’, and ‘F/F’, and only marginally to ‘Multi’ and ‘Other’.
singleCategorySummary <- summary(as.factor(unlist(category[unlist(lapply(category, function(x) length(x))) == 1])))
singleCategorySummary <- data.frame(Category = names(singleCategorySummary),
Number.of.Works = singleCategorySummary)
singleCategorySummary$Split <- "Single category"
multipleCategorySummary <- data.frame(Category = c('Gen', 'F/F', 'F/M', 'M/M', 'Multi', 'Other', 'No category'),
Number.of.Works = c(sum(grepl('Gen',category)),
sum(grepl('F/F',category)),
sum(grepl('F/M',category)),
sum(grepl('M/M',category)),
sum(grepl('Multi',category)),
sum(grepl('Other',category)),
sum(grepl('No category',category))) )
multipleCategorySummary$Split <- "All works"
categorySummary <- rbind(singleCategorySummary, multipleCategorySummary)
categorySummary$Category <- factor(categorySummary$Category, levels = c('Gen', 'F/F', 'F/M', 'M/M', 'Multi', 'Other', 'No category'))
categorySummary$Split <- factor(categorySummary$Split, levels = c("Single category", "All works"))
plotCategories <- ggplot(categorySummary, aes(x = Category, y = Number.of.Works)) +
geom_col(alpha=1)+
theme_half_open() +
background_grid() +
facet_wrap(.~Split) +
theme(legend.title=element_blank(),
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
labs(y="Number of Works")
plotCategories

rm(singleCategorySummary, multipleCategorySummary, categorySummary, plotCategories)
Engagement by a single category
For simplicity I’m only looking at works tagged with a single category here.
“Other” seems to have most words, despite being a tiny category, and collects quite a bit of Hits, Kudos, Comments and Bookmarks. It’s possible that a number of those works are collections of stories for many fandoms, which amplifies the engagement numbers.
‘M/M’ and ‘F/F’ works collect the most hits, but ‘F/F’ gets significantly fewer kudos, comments and bookmarks.
statsCategory <- stats[unlist(lapply(category, function(x) length(x))) == 1,]
statsCategory$Category <- as.factor(unlist(category[unlist(lapply(category, function(x) length(x))) == 1]))
statsCategory$Category <- factor(statsCategory$Category, levels = c('Gen', 'F/F', 'F/M', 'M/M', 'Multi', 'Other', 'No category'))
statsCategory$Divisor <- unlist(lapply(statsCategory$Category, function(x) summary(statsCategory$Category)[names(summary(statsCategory$Category)) == x]))
statsCategory$Words.per.Work <- statsCategory$Words/statsCategory$Divisor
statsCategory$Hits.per.Work <- statsCategory$Hits/statsCategory$Divisor
statsCategory$Kudos.per.Work <- statsCategory$Kudos/statsCategory$Divisor
statsCategory$Comments.per.Work <- statsCategory$Comments/statsCategory$Divisor
statsCategory$Bookmarks.per.Work <- statsCategory$Bookmarks/statsCategory$Divisor
statsCategory$Works.Percent <- 1/statsCategory$Divisor
barWorksCategory <- plot_bar_color(statsCategory, 'Category', 'Rating', 'right')
barWordsCategory <- plot_col_color(statsCategory, 'Category', 'Words.per.Work', 'Rating', 'right')
barHitsCategory <- plot_col_color(statsCategory, 'Category', 'Hits.per.Work', 'Rating', 'right')
barKudosCategory <- plot_col_color(statsCategory, 'Category', 'Kudos.per.Work', 'Rating', 'right')
barCommentsCategory <- plot_col_color(statsCategory, 'Category', 'Comments.per.Work', 'Rating', 'right')
barBookmarksCategory <- plot_col_color(statsCategory, 'Category', 'Bookmarks.per.Work','Rating', 'right')
plot_grid(plot_grid( barWorksCategory + theme(legend.position="none"),
barWordsCategory + theme(legend.position="none"),
barHitsCategory + theme(legend.position="none"),
barKudosCategory + theme(legend.position="none"),
barCommentsCategory + theme(legend.position="none"),
barBookmarksCategory + theme(legend.position="none"),
align = 'hv'),
get_legend(barWorksCategory + theme(legend.title=element_blank())),
rel_widths = c(4,1))

Ratings percentages by a single category
In absolute numbers “M/M” is the most popular category, and in relative numbers it gets the most explicit works.
plotWorksCategoryNormalized <- plot_col_color(statsCategory, 'Rating', 'Works.Percent', 'Rating', 'none')+
scale_y_continuous(labels=scales::percent)+
facet_wrap(.~Category)
plotWorksCategoryNormalized

rm(barWorksCategory, barWordsCategory, barHitsCategory, barKudosCategory, barCommentsCategory, barBookmarksCategory, plotWorksCategoryNormalized)
Single Category through time
Season 1 had a significant “F/F” bump, likely to “Eleanor Guthrie/Max” relationship. In season 2 we see a dip, but seasons 3 and 4 slowly build up to a strong peak, likely due to “Anne Bonny/Max” endgame. Season 1 is almonst non-existent in terms of “M/M” category, which may be attributed to early series build up being described as “straightbaiting”. Throughout seasons 2-4 it however builds up to a strong peak, with complex “Captain Flint/John Silver” developments and “Captain Flint/Thomas Hamilton” finale.
plotDatesRatingDensity <- ggplot(statsCategory, aes(x = Date, col=Category)) +
geom_density(alpha = 0.1)+
geom_vline(xintercept=seasonsStart)+
geom_vline(xintercept=seasonsEnd, linetype ="longdash")+
scale_x_date(date_breaks="3 months")+
scale_color_manual(values = custompalette) +
theme_half_open() +
background_grid() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
legend.position = 'right')
plotDatesRatingDensity

rm(plotDatesRatingDensity)
Ship tags through time
Season 3 kickstarted the growth of popularity of “Captain Flint/John Silver”, which continued through season 4 and after, however at around March 2019 alternative tag “Captain Flint | James McGraw/John Silver” becomes more popular. Season 4 sees the rapid growth of “Captain Flint/Thomas Hamilton”, which similarly switches to alternative tag “Captain Flint/Thomas Hamilton” in March 2019. The shift in tagging most likely happened due to the efforts of Ao3 tag wranglers.
plotRelationships <- ggplot() +
geom_density(data = relationshipsStats[relationshipsStats$relationship1 > 0,], mapping=aes(x = Date), colour=custompalette[1])+
geom_density(data = relationshipsStats[relationshipsStats$relationship2 > 0,], mapping=aes(x = Date), colour=custompalette[2])+
geom_density(data = relationshipsStats[relationshipsStats$relationship3 > 0,], mapping=aes(x = Date), colour=custompalette[3])+
geom_density(data = relationshipsStats[relationshipsStats$relationship4 > 0,], mapping=aes(x = Date), colour=custompalette[4])+
geom_density(data = relationshipsStats[relationshipsStats$relationship5 > 0,], mapping=aes(x = Date), colour=custompalette[5])+
geom_density(data = relationshipsStats[relationshipsStats$relationship6 > 0,], mapping=aes(x = Date), colour=custompalette[6])+
geom_density(data = relationshipsStats[relationshipsStats$relationship7 > 0,], mapping=aes(x = Date), colour=custompalette[7])+
geom_density(data = relationshipsStats[relationshipsStats$relationship8 > 0,], mapping=aes(x = Date), colour=custompalette[8])+
geom_vline(xintercept=seasonsStart)+
geom_vline(xintercept=seasonsEnd, linetype ="longdash")+
scale_x_date(date_breaks="3 months")+
scale_color_manual(values = custompalette) +
theme_half_open() +
background_grid() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
mylegend <- get_legend(plotLegendRelationships)
plot_grid(plotRelationships, mylegend,
rel_widths = c(2,1), nrow=1)

#plotRelationships
#rm(seasons, plotDatesRatingDensity)
Archive Warnings
Majority of works are tagged with “No Archive Warnings Apply”, followed by a sizable fraction of “Creator Chose Not To Use Archive Warnings”. It seems to be a common matter of confusion between the usage of those two warnings, so it’s possible that a lot of “Creator Chose Not To Use Archive Warnings” are mistagged “No Archive Warnings Apply”.
multipleWarningSummary <- data.frame(Warning = c("No Archive Warnings Apply",
"Graphic Depictions Of Violence",
"Major Character Death",
"Rape/Non-Con",
"Underage",
"Creator Chose Not To Use Archive Warnings"),
Number.of.Works = c(sum(grepl("No Archive Warnings Apply",warnings)),
sum(grepl("Graphic Depictions Of Violence",warnings)),
sum(grepl("Major Character Death",warnings)),
sum(grepl("Rape/Non-Con",warnings)),
sum(grepl("Underage",warnings)),
sum(grepl("Creator Chose Not To Use Archive Warnings",warnings))) )
multipleWarningSummary$Warning <- factor(multipleWarningSummary$Warning, levels = c("No Archive Warnings Apply",
"Graphic Depictions Of Violence",
"Major Character Death",
"Rape/Non-Con",
"Underage",
"Creator Chose Not To Use Archive Warnings"))
plotWarnings <- plot_col(multipleWarningSummary, 'Warning', 'Number.of.Works', 'right')
plotWarnings

rm(multipleWarningSummary, plotWarnings)
Multiple Fandoms
Number of works tagged with more than 1 fandom is 181, but the number of works explicitly tagged as ‘crossover’ is just 42.
Authors by Works
Top 30 of most prolific authors in the tag by the number of stories as of data collection date:
topList <- 30
AuthorTable <- data.frame('Author' = names(summary(as.factor(unlist(author)))[1:topList]),
'Number of Stories' = summary(as.factor(unlist(author)))[1:topList])
row.names(AuthorTable) <- c()
kable(AuthorTable,
col.names = c('Author', 'Number of Stories'))
Author
|
Number of Stories
|
Magnetism_bind
|
200
|
Melis_Ash
|
118
|
OnlyOneWoman
|
98
|
iwtv
|
94
|
ElDiablito_SF
|
91
|
PrimalScream
|
90
|
VarjoRuusu
|
79
|
queerpyrate
|
61
|
WeeBeastie
|
61
|
DreamingPagan
|
49
|
mapped
|
48
|
meridian_rose (meridianrose)
|
48
|
agdhani
|
46
|
shirogiku
|
44
|
vowelinthug
|
40
|
lacecat
|
39
|
NahaFlowers
|
34
|
medusine
|
33
|
orphan_account
|
33
|
Sweety_Mutant
|
32
|
depugnare
|
31
|
Elisexyz
|
28
|
Arzani
|
25
|
Diana924
|
25
|
Palebluedot
|
23
|
khazadspoon
|
22
|
Lazurit
|
22
|
fosfomifira
|
21
|
jauneclair
|
21
|
NovaCaelum
|
21
|
rm(AuthorTable)
Top place is occupied by orphan_account, which is an artifact of archive’ works orphaning function.
Authors by Words
Only 69 works have more than one author. In cases where works had more than one author, I assumed that each of them contributed an equal amounts of words.
Top 30 of most prolific authors in the tag by the number of words written as of data collection date:
wordsByAuthor <- c()
for (i in 1:length(words)){
if (length(author[[i]]) > 1) {
wordsByAuthor <- c(wordsByAuthor, rep(words[[i]]/length(author[[5]]), length(author[[i]]) ) )
} else {
wordsByAuthor <- c(wordsByAuthor, words[[i]])
}
}
AuthorWordsTable <- data.frame('Author' = as.factor(unlist(author)),
'Words' = wordsByAuthor)
AuthorWordsSummary <- ddply(AuthorWordsTable, .(Author),
summarize,
Total.Words = sum(Words))
AuthorWordsSummary <- AuthorWordsSummary[order(AuthorWordsSummary$Total.Words, decreasing = TRUE),]
row.names(AuthorWordsSummary) <- c()
topList <- 30
kable(AuthorWordsSummary[1:topList,],
col.names = c('Author', 'Total Words'))
Author
|
Total Words
|
qqueenofhades
|
890452
|
LexyRomanova
|
856531
|
OnlyOneWoman
|
848757
|
Magnetism_bind
|
670789
|
iwtv
|
652764
|
lacecat
|
551184
|
PrimalScream
|
540335
|
DreamingPagan
|
461150
|
ElDiablito_SF
|
444795
|
vowelinthug
|
421236
|
queerpyrate
|
322172
|
Wind_Ryder
|
281640
|
brasspetal
|
266743
|
orphan_account
|
260828
|
VarjoRuusu
|
255679
|
WeeBeastie
|
225000
|
agdhani
|
203114
|
BehindBrokenWindows
|
200105
|
benchofindigo
|
194519
|
medusine
|
190964
|
sweetsunray
|
190186
|
Myheadisclear
|
182883
|
Thatswherethelightgetsin
|
182380
|
x_art
|
182019
|
Tarasque
|
181685
|
mapped
|
176418
|
seren_ccd
|
166441
|
StarRose
|
165105
|
Rising_Phoenix
|
161101
|
sebastianL (felix_atticus)
|
157171
|
rm(wordsByAuthor, i, AuthorWordsTable, AuthorWordsSummary)
Interestingly, orphan_account made it to the top by the number of words written as well.
Characters
Top 30 of the most popular characters:
topList <- 30
CharacterTable<- data.frame('Character' = names(summary(as.factor(unlist(character)))[1:topList]),
'Number of Stories' = summary(as.factor(unlist(character)))[1:topList])
row.names(CharacterTable) <- c()
kable(CharacterTable,
col.names = c('Character', 'Number of Stories'))
Character
|
Number of Stories
|
John Silver
|
2013
|
Captain Flint (Black Sails)
|
1919
|
Thomas Hamilton
|
1218
|
Billy Bones
|
776
|
Charles Vane
|
650
|
Miranda Barlow
|
633
|
Eleanor Guthrie
|
586
|
Anne Bonny
|
581
|
Max (Black Sails)
|
551
|
“Calico” Jack Rackham
|
524
|
Captain Flint
|
469
|
Captain Flint | James McGraw
|
444
|
Madi (Black Sails)
|
360
|
Hal Gates
|
188
|
Abigail Ashe
|
182
|
Woodes Rogers
|
154
|
James Flint
|
140
|
James McGraw
|
126
|
Ben Gunn
|
108
|
Blackbeard | Edward Teach
|
97
|
Idelle (Black Sails)
|
81
|
Peter Ashe
|
74
|
Original Characters
|
71
|
Muldoon (Black Sails)
|
66
|
Edward “Ned” Low
|
60
|
Dufresne (Black Sails)
|
55
|
Joji (Black Sails)
|
55
|
Dr. Howell (Black Sails)
|
49
|
Original Female Character(s)
|
49
|
Mr. Scott (Black Sails)
|
47
|
rm(CharacterTable)
Relationships
Top 30 of the most popular relationships:
I don’t have access to Ao3’s system of synonymous tags, so by virtue of text processing some relationship tags here are repeated.
Overwhelmingly, “Korra/Asami Sato”/“Korrasami”/“Korra/Asami” is the most popular relationship in LOK, contributing to popularity of “F/F” category. They are followed by “Korra/Mako (Avatar)”, and “Bolin/Opal (Avatar)”.
topList <- 30
RelationshipsTable<- data.frame('Relationship' = names(summary(as.factor(unlist(relationships)))[1:topList]),
'Number of Stories' = summary(as.factor(unlist(relationships)))[1:topList])
row.names(RelationshipsTable) <- c()
kable(RelationshipsTable,
col.names = c('Relationship', 'Number of Stories'))
Relationship
|
Number of Stories
|
Captain Flint/John Silver
|
1378
|
Captain Flint/Thomas Hamilton
|
811
|
Captain Flint | James McGraw/John Silver
|
229
|
Madi/John Silver
|
229
|
Captain Flint | James McGraw/Thomas Hamilton
|
218
|
Miranda Barlow/Captain Flint/Thomas Hamilton
|
214
|
Anne Bonny/“Calico” Jack Rackham
|
190
|
Anne Bonny/Max
|
183
|
Miranda Barlow/Captain Flint
|
169
|
Captain Flint/Thomas Hamilton/John Silver
|
155
|
Eleanor Guthrie/Max
|
146
|
Eleanor Guthrie/Charles Vane
|
135
|
Billy Bones/Captain Flint
|
121
|
Miranda Barlow/Thomas Hamilton
|
109
|
Thomas Hamilton/John Silver
|
80
|
James McGraw/Thomas Hamilton
|
73
|
Abigail Ashe/Billy Bones
|
69
|
Billy Bones/Ben Gunn
|
66
|
Eleanor Guthrie/Woodes Rogers
|
65
|
Miranda Barlow/Captain Flint | James McGraw/Thomas Hamilton
|
65
|
Anne Bonny/“Calico” Jack Rackham/Max
|
63
|
Captain Flint/Charles Vane
|
62
|
Miranda Barlow & Captain Flint & Thomas Hamilton
|
59
|
“Calico” Jack Rackham/Charles Vane
|
56
|
Billy Bones/Charles Vane
|
55
|
Captain Flint/Madi/John Silver
|
52
|
Captain Flint & John Silver
|
50
|
Captain Flint | James McGraw/Thomas Hamilton/John Silver
|
41
|
Billy Bones/John Silver
|
39
|
Miranda Barlow/Captain Flint | James McGraw
|
37
|
rm(RelationshipsTable)
Languages
Unsurprisingly, most works are written in English. Apologies for U+. kable package for whatever reason murders unicode characters. The two languages in question are Russian (Русский) and Chinese (中文).
#topList <- 30
languagesList <- summary(as.factor(unlist(language)))
LanguageTable <- data.frame('Language' = names(languagesList),
'Number of Stories' = languagesList )
LanguageTable <- LanguageTable[order(LanguageTable$Number.of.Stories, decreasing=TRUE),]
row.names(LanguageTable) <- c()
kable(LanguageTable,
col.names = c('Language', 'Number of Stories'))
Language
|
Number of Stories
|
English
|
3963
|
<U+0420><U+0443><U+0441><U+0441><U+043A><U+0438><U+0439>
|
226
|
Italiano
|
47
|
<U+4E2D><U+6587>
|
38
|
Español
|
22
|
Deutsch
|
13
|
Français
|
12
|
Polski
|
4
|
Ceština
|
1
|
Português brasileiro
|
1
|
#languagesList
#rm(LanguageTable)
---
title: "Ao3 data analysis for Black Sails"
author: "darthaline"
date: "11 Aug 2020"
output:
  html_notebook:
    code_folding: "hide"
    toc: true
---

# About

```{r setup, message = FALSE, warning=FALSE}

Sys.setenv(LANG = "en")
#library("rstudioapi") #to grab local position of the script
#setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
knitr::opts_knit$set(root.dir = '.')

#library("rvest") # to handle html stuff

library(lubridate) # to handle dates

library(ggplot2) # for plotting
library(cowplot) # for plotting
library(RColorBrewer) # for choosing colors

custompalette <- brewer.pal(n=8, name = 'Dark2')

library(knitr) # for tables
library(kableExtra) # for tables

library(lubridate) # for dates

library(plyr) # ddply, to summarize number of words by author

load('BSails_worksData.RData')

```

This is a document detailing analysis of [`r tagValue` Ao3 tag](https://archiveofourown.org/tags/Black%20Sails/works) data, collected on the 11 Aug 2020. I haven't figured out a way to get my scrapper to log in into Ao3 (yet? rvest seems to have some trouble with page redirects), so results here are based on the works visible without authentication, which likely filters out preferentially explicit/problemantic works from the selection.

```{r plottingFunctions, collapse=TRUE, warning=FALSE}

plot_bar <- function (data, columnX, legendPosition) {
    ggplot(data, aes_string(x = columnX)) + 
    geom_bar(alpha=1)+
    theme_half_open() +
    background_grid() +
    theme(legend.title=element_blank(),
          axis.title.x = element_blank(),
          axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
    labs(y="Number of works")
}

plot_bar_color <- function (data, columnX, colColor, legendPosition) {
    ggplot(data, aes_string(x = columnX, fill=colColor)) + 
    geom_bar(alpha=0.7)+
    scale_fill_manual(values = custompalette) +
    theme_half_open() +
    background_grid() +
    theme(legend.title=element_blank(),
          axis.title.x = element_blank(),
          axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
    labs(y="Number of works")
}

plot_col <- function (data, columnX, columnY, legendPosition) {
    ggplot(data, aes_string(x = columnX, y = columnY)) + 
    geom_col(alpha=1)+
    theme_half_open() +
    background_grid() +
    theme(legend.title=element_blank(),
          axis.title.x = element_blank(),
          axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
    labs(y=gsub('\\.', ' ', columnY))
  
}

plot_col_color <- function (data, columnX, columnY, colColor, legendPosition) {
    ggplot(data, aes_string(x = columnX, y = columnY, fill=colColor)) + 
    geom_col(alpha=0.7)+
    scale_fill_manual(values = custompalette) +
    theme_half_open() +
    background_grid() +
    theme(legend.title=element_blank(),
          axis.title.x = element_blank(),
          axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
    labs(y=gsub('\\.', ' ', columnY))
  
}

plot_percentiles <- function (data, columnX, columnY, legendPosition) {
    ggplot(data, aes_string(x = columnX, y = columnY)) + 
    geom_point(alpha=0.3)+
    scale_y_log10(breaks = 10^c(0:15))+
    scale_x_continuous(breaks = c(0, 25, 50, 75, 100))+ #scale_x_continuous(breaks = c(0:10)*10)+
    theme_half_open() +
    background_grid() +
    theme(legend.title=element_blank())+
    labs(x=gsub('\\.', ' ', columnX))
}

```

```{r flatteningData, message = FALSE, warning=FALSE}
#title <- lapply(worksData, function(x) {x$Title})
author <- lapply(worksData, function(x) {x$Author})
fandom <- lapply(worksData, function(x) {x$Fandom})
rating <- lapply(worksData, function(x) {x$Rating})
warnings <- lapply(worksData, function(x) {x$Warnings})
category <- lapply(worksData, function(x) {x$Category})
WIP <- lapply(worksData, function(x) {x$WIP})
date <-lapply(worksData, function(x) {x$Date})
relationships <-lapply(worksData, function(x) {x$Relationships})
character <-lapply(worksData, function(x) {x$Character})
freeform <-lapply(worksData, function(x) {x$Freeform})
language <-lapply(worksData, function(x) {x$Language})
words <-lapply(worksData, function(x) {x$Words})
words[is.na(words)] <- 0
kudos <-lapply(worksData, function(x) {x$Kudos})
kudos[is.na(kudos)] <- 0
comments <-lapply(worksData, function(x) {x$Comments})
comments[is.na(comments)] <- 0
bookmarks<-lapply(worksData, function(x) {x$Bookmarks})
bookmarks[is.na(bookmarks)] <- 0
hits <-lapply(worksData, function(x) {x$Hits})
hits[is.na(hits)] <- 0

stats <- data.frame(Words = unlist(words, recursive = FALSE),
                    Comments= as.numeric(as.character(comments)),
                    Kudos = as.numeric(as.character(kudos)),
                    Bookmarks = as.numeric(as.character(bookmarks)),
                    Hits = as.numeric(as.character(hits)),
                    WIP = unlist(WIP, recursive = FALSE),
                    Rating = unlist(rating, recursive = FALSE),
                    Date = do.call("c", date))

stats$Rating <- factor(stats$Rating, levels = c("Not Rated", "General Audiences", "Teen And Up Audiences", "Mature", "Explicit"))

total <- 1000
percentile <- c(1:total)
percentileData <- data.frame(Works.Percentile = 100*(total - percentile)/total,
                             Words = unlist(lapply(percentile/total, quantile, x = unlist(words) )) + 1,
                             Hits = unlist(lapply(percentile/total, quantile, x = unlist(hits) )) + 1,
                             Kudos = unlist(lapply(percentile/total, quantile, x = unlist(kudos) )) + 1,
                             Comments = unlist(lapply(percentile/total, quantile, x = unlist(comments) )) + 1,
                             Bookmarks = unlist(lapply(percentile/total, quantile, x = unlist(bookmarks) )) + 1 )

rm(rating, kudos, comments, bookmarks, hits)

```

# Timeline

Solid vertical lines on the graph indicate initial air dates, and dashed indicate final air dates, according to [Wiki article](https://en.wikipedia.org/wiki/Black_Sails_(TV_series)#Plot).

We see a peak of activity after each season which builds up to a major peak after season 4. A minor bump in May 2020 could be attributed to coronavirus locdowns.

There are 3 works published before season 1 premier, one on 22 June 2013, which is probably an artifact of work import, the other two are dated 22 of January are tagged with "Anne Bonny/"Calico" Jack Rackham " and could be due to trailer hype and/or historical pirates fandom.

```{r timelineTotal, message = FALSE, fig.width=10, fig.height=6}

#data$Timestamp <- parse_date_time2(as.character(data$Timestamp), orders = "%d/%m/%Y %H:%M:%S")
#data$day <- as.Date(data$Timestamp)

seasonsStart <- c("2014-01-25", "2015-01-24", "2016-01-23", "2017-01-29")
seasonsStart <- as.Date(seasonsStart)
seasonsEnd <- c("2014-03-15", "2015-03-28", "2016-03-26", "2017-04-02")
seasonsEnd <- as.Date(seasonsEnd)

plotDatesDensityTotal <- ggplot(stats, aes(x = Date)) + 
                    geom_density()+
                    geom_vline(xintercept=as.numeric(seasonsStart))+
                    geom_vline(xintercept=as.numeric(seasonsEnd), linetype ="longdash")+
                    scale_x_date(date_breaks="3 months")+
                    theme_half_open() +
                    background_grid() +
                    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
                          legend.position = 'right')
plotDatesDensityTotal

rm(plotDatesDensityTotal)
```

If we plot Complete Works and Works in Progress separately, we still observe an overall peak structure in complete works, but, interestingly, Works in Progress follow the basic structure but don't fluctuate much with new seasons.

```{r timelineWIP, message = FALSE, fig.width=10, fig.height=6}

plotDatesDensity <- ggplot(stats, aes(x = Date, col=WIP)) + 
                    geom_density(alpha = 0.1)+
                    geom_vline(xintercept=seasonsStart)+
                    geom_vline(xintercept=seasonsEnd, linetype ="longdash")+
                    scale_x_date(date_breaks="3 months")+
                    theme_half_open() +
                    background_grid() +
                    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
                          legend.position = 'right')
plotDatesDensity

rm(plotDatesDensity)

```


# Engagement percentiles

Small plotting cheat: all the numbers on the Y axis are increased by 1 to include the case of 0 into the plot (otherwise excluded because of log scale).

* About 75% of works have more than a 1000 words, but only about 10% have more than 10000 words.
* Only about 25% of works have over a 1000 hits.
* Only about 25% of works have more than a 100 kudos.
* Only about 50% of works get more than 10 comments.
* Approximately 5% of works have no comments (tail end).
* Only approximately 35% of works get more than 10 bookmarks.
* Approximately 15% of works have no bookmarks (tail end).

```{r percentiles, message = FALSE}
wordsPercentiles <- plot_percentiles(percentileData, 'Works.Percentile', 'Words', 'right')
hitsPercentiles <- plot_percentiles(percentileData, 'Works.Percentile', 'Hits', 'right')
kudosPercentiles <- plot_percentiles(percentileData, 'Works.Percentile', 'Kudos', 'right')
commentsPercentiles <- plot_percentiles(percentileData, 'Works.Percentile', 'Comments', 'right')
bookmarksPercentiles <- plot_percentiles(percentileData, 'Works.Percentile', 'Bookmarks', 'right')

plot_grid(wordsPercentiles + theme(legend.position="none"),
          hitsPercentiles + theme(legend.position="none"),
          kudosPercentiles + theme(legend.position="none"),
          commentsPercentiles + theme(legend.position="none"),
          bookmarksPercentiles + theme(legend.position="none") )

rm(total, percentile, percentileData, wordsPercentiles, hitsPercentiles, kudosPercentiles, commentsPercentiles, bookmarksPercentiles)

```

# Complete Work vs Work in Progress distributions

* Majority of works are Complete
* Works in Progress are more than 3 times longer than Complete ones.
* Both Complete Works and Works in Progress get approximately the same amounts of hits.
* Complete Works get about 20% more kudos.
* Works in Progress get approximately 2 times as many comments as Complete ones (however, again, there's no way to filter out author's comments in the search selection).
* Complete Works get slightly more bookmarks than Works in Progress.

```{r totalWorksWIP, message = FALSE, warning=FALSE, fig.width=8, fig.height=6}

statsWIP <- stats
statsWIP$Divisor <- unlist(lapply(statsWIP$WIP, function(x) summary(statsWIP$WIP)[names(summary(statsWIP$WIP)) == x]))
statsWIP$Words.per.Work <- statsWIP$Words/statsWIP$Divisor
statsWIP$Hits.per.Work <- statsWIP$Hits/statsWIP$Divisor
statsWIP$Kudos.per.Work <- statsWIP$Kudos/statsWIP$Divisor
statsWIP$Comments.per.Work <- statsWIP$Comments/statsWIP$Divisor
statsWIP$Bookmarks.per.Work <- statsWIP$Bookmarks/statsWIP$Divisor

barWorksWIP <- plot_bar(statsWIP, 'WIP', 'right')
barWordsWIP <- plot_col(statsWIP, 'WIP', 'Words.per.Work', 'right')
barHitsWIP <- plot_col(statsWIP, 'WIP', 'Hits.per.Work', 'right')
barKudosWIP <- plot_col(statsWIP, 'WIP', 'Kudos.per.Work', 'right')
barCommentsWIP <- plot_col(statsWIP, 'WIP', 'Comments.per.Work', 'right')
barBookmarksWIP <- plot_col(statsWIP, 'WIP', 'Bookmarks.per.Work', 'right')

# plot_grid(plot_grid( barWorksWIP + theme(legend.position="none"),
#                      barWordsWIP + theme(legend.position="none"),
#                      barHitsWIP + theme(legend.position="none"),
#                      barKudosWIP + theme(legend.position="none"),
#                      barCommentsWIP + theme(legend.position="none"),
#                      barBookmarksWIP + theme(legend.position="none"),
#                      align = 'hv'),
#           get_legend(barWorksWIP + theme(legend.title=element_blank())),
#           rel_widths = c(4,1),
#           align = 'hv')
plot_grid( barWorksWIP + theme(legend.position="none"),
           barWordsWIP + theme(legend.position="none"),
           barHitsWIP + theme(legend.position="none"),
           barKudosWIP + theme(legend.position="none"),
           barCommentsWIP + theme(legend.position="none"),
           barBookmarksWIP + theme(legend.position="none"),
           align = 'hv')

rm(statsWIP, barWorksWIP, barWordsWIP, barHitsWIP, barKudosWIP, barCommentsWIP, barBookmarksWIP)
```

# Rating distributions

* Most works are rated T and E, but G and M rated works are not far behind.
* Works rated G are on average the shortest (~1500 words), followed by T (~4000 words), Not rated (~9000 words), E (~ 9000 words), and M (~11000 words). The trend of M rated works being the longest we observed in other fandoms holds here as well.
* G rated works get fewest hits (~500), followed by Not Rated (~700). Otherwise, the number of hits rises with the rating. E rated works are most popular (~1600).
* Number of kudos, comments and bookmarks seem to be broadly proportional to the number of hits.


```{r totalWorksRating, message = FALSE, warning=FALSE, fig.width=8, fig.height=8}

statsRating <- stats
statsRating$Divisor <- unlist(lapply(statsRating$Rating, function(x) summary(statsRating$Rating)[names(summary(statsRating$Rating)) == x]))
statsRating$Words.per.Work <- statsRating$Words/statsRating$Divisor
statsRating$Hits.per.Work <- statsRating$Hits/statsRating$Divisor
statsRating$Kudos.per.Work <- statsRating$Kudos/statsRating$Divisor
statsRating$Comments.per.Work <- statsRating$Comments/statsRating$Divisor
statsRating$Bookmarks.per.Work <- statsRating$Bookmarks/statsRating$Divisor

barWorksRating <- plot_bar(statsRating, 'Rating', 'right')
barWordsRating <- plot_col(statsRating, 'Rating', 'Words.per.Work', 'right')
barHitsRating <- plot_col(statsRating, 'Rating', 'Hits.per.Work', 'right')
barKudosRating <- plot_col(statsRating, 'Rating', 'Kudos.per.Work', 'right')
barCommentsRating <- plot_col(statsRating, 'Rating', 'Comments.per.Work', 'right')
barBookmarksRating <- plot_col(statsRating, 'Rating', 'Bookmarks.per.Work', 'right')

plot_grid( barWorksRating + theme(legend.position="none"),
           barWordsRating + theme(legend.position="none"),
           barHitsRating + theme(legend.position="none"),
           barKudosRating + theme(legend.position="none"),
           barCommentsRating + theme(legend.position="none"),
           barBookmarksRating + theme(legend.position="none"),
           align = 'hv')

rm(statsRating, barWorksRating, barWordsRating, barHitsRating, barKudosRating, barCommentsRating, barBookmarksRating)
```

# Categories

There are `r length(category[unlist(lapply(category, function(x) length(x))) == 1])` works tagged with a single category, and `r length(category[unlist(lapply(category, function(x) length(x))) > 1])` tagged with 2 or more (up until all 6).

'M/M' is the most popular category, and it dwarfs all the others.

Multiple category fics strongly contribute towards 'M/M' count, then to 'F/M', 'Gen', and 'F/F', and only marginally to 'Multi' and 'Other'.

```{r categoriesBars, message = FALSE}

singleCategorySummary <- summary(as.factor(unlist(category[unlist(lapply(category, function(x) length(x))) == 1])))
singleCategorySummary <- data.frame(Category = names(singleCategorySummary),
                                    Number.of.Works = singleCategorySummary)
singleCategorySummary$Split <- "Single category"

multipleCategorySummary <- data.frame(Category = c('Gen', 'F/F', 'F/M', 'M/M', 'Multi', 'Other', 'No category'),
                              Number.of.Works = c(sum(grepl('Gen',category)),
                                                  sum(grepl('F/F',category)),
                                                  sum(grepl('F/M',category)),
                                                  sum(grepl('M/M',category)),
                                                  sum(grepl('Multi',category)),
                                                  sum(grepl('Other',category)),
                                                  sum(grepl('No category',category))) )
multipleCategorySummary$Split <- "All works"

categorySummary <- rbind(singleCategorySummary, multipleCategorySummary)
categorySummary$Category <- factor(categorySummary$Category, levels = c('Gen', 'F/F', 'F/M', 'M/M', 'Multi', 'Other', 'No category'))
categorySummary$Split <- factor(categorySummary$Split, levels = c("Single category", "All works"))

plotCategories <- ggplot(categorySummary, aes(x = Category, y = Number.of.Works)) + 
                  geom_col(alpha=1)+
                  theme_half_open() +
                  background_grid() +
                  facet_wrap(.~Split) +
                  theme(legend.title=element_blank(),
                        axis.title.x = element_blank(),
                        axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
                  labs(y="Number of Works")
plotCategories

rm(singleCategorySummary, multipleCategorySummary, categorySummary, plotCategories)

```

# Engagement by a single category

For simplicity I'm only looking at works tagged with a single category here.

"Other" seems to have most words, despite being a tiny category, and collects quite a bit of Hits, Kudos, Comments and Bookmarks. It's possible that a number of those works are collections of stories for many fandoms, which amplifies the engagement numbers.

'M/M' and 'F/F' works collect the most hits, but 'F/F' gets significantly fewer kudos, comments and bookmarks.

```{r categoriesSingleEngagement, message = FALSE, warning = FALSE, fig.width=10, fig.height=6}

statsCategory <- stats[unlist(lapply(category, function(x) length(x))) == 1,]
statsCategory$Category <- as.factor(unlist(category[unlist(lapply(category, function(x) length(x))) == 1]))
statsCategory$Category <- factor(statsCategory$Category, levels = c('Gen', 'F/F', 'F/M', 'M/M', 'Multi', 'Other', 'No category'))
statsCategory$Divisor <- unlist(lapply(statsCategory$Category, function(x) summary(statsCategory$Category)[names(summary(statsCategory$Category)) == x]))
statsCategory$Words.per.Work <- statsCategory$Words/statsCategory$Divisor
statsCategory$Hits.per.Work <- statsCategory$Hits/statsCategory$Divisor
statsCategory$Kudos.per.Work <- statsCategory$Kudos/statsCategory$Divisor
statsCategory$Comments.per.Work <- statsCategory$Comments/statsCategory$Divisor
statsCategory$Bookmarks.per.Work <- statsCategory$Bookmarks/statsCategory$Divisor
statsCategory$Works.Percent <- 1/statsCategory$Divisor

barWorksCategory <- plot_bar_color(statsCategory, 'Category', 'Rating', 'right')
barWordsCategory <- plot_col_color(statsCategory, 'Category', 'Words.per.Work', 'Rating', 'right')
barHitsCategory <- plot_col_color(statsCategory, 'Category', 'Hits.per.Work', 'Rating', 'right')
barKudosCategory <- plot_col_color(statsCategory, 'Category', 'Kudos.per.Work', 'Rating', 'right')
barCommentsCategory <- plot_col_color(statsCategory, 'Category', 'Comments.per.Work', 'Rating', 'right')
barBookmarksCategory <- plot_col_color(statsCategory, 'Category', 'Bookmarks.per.Work','Rating', 'right')

plot_grid(plot_grid( barWorksCategory + theme(legend.position="none"),
           barWordsCategory + theme(legend.position="none"),
           barHitsCategory + theme(legend.position="none"),
           barKudosCategory + theme(legend.position="none"),
           barCommentsCategory + theme(legend.position="none"),
           barBookmarksCategory + theme(legend.position="none"),
           align = 'hv'),
          get_legend(barWorksCategory + theme(legend.title=element_blank())),
          rel_widths = c(4,1))

```

# Ratings percentages by a single category

In absolute numbers "M/M" is the most popular category, and in relative numbers it gets the most explicit works.

```{r categoriesSingleEngagementPercent, message = FALSE, warning = FALSE, fig.width=10, fig.height=6}

plotWorksCategoryNormalized <- plot_col_color(statsCategory, 'Rating', 'Works.Percent', 'Rating', 'none')+
                               scale_y_continuous(labels=scales::percent)+
                               facet_wrap(.~Category)
plotWorksCategoryNormalized

rm(barWorksCategory, barWordsCategory, barHitsCategory, barKudosCategory, barCommentsCategory, barBookmarksCategory, plotWorksCategoryNormalized)

```

# Single Category through time

Season 1 had a significant "F/F" bump, likely to "Eleanor Guthrie/Max" relationship. In season 2 we see a dip, but seasons 3 and 4 slowly build up to a strong peak, likely due to "Anne Bonny/Max" endgame. Season 1 is almonst non-existent in terms of "M/M" category, which may be attributed to early series build up being described as "straightbaiting". Throughout seasons 2-4 it however builds up to a strong peak, with complex "Captain Flint/John Silver" developments and "Captain Flint/Thomas Hamilton" finale.

```{r singleRatingTime, message = FALSE, fig.width=10, fig.height=6}

plotDatesRatingDensity <- ggplot(statsCategory, aes(x = Date, col=Category)) + 
                    geom_density(alpha = 0.1)+
                    geom_vline(xintercept=seasonsStart)+
                    geom_vline(xintercept=seasonsEnd, linetype ="longdash")+
                    scale_x_date(date_breaks="3 months")+
                    scale_color_manual(values = custompalette) +
                    theme_half_open() +
                    background_grid() +
                    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
                          legend.position = 'right')
plotDatesRatingDensity

rm(plotDatesRatingDensity)
```

# Most popular ship tags

Two most popular ships in Black Sails are "Captain Flint/John Silver" ("Captain Flint | James McGraw/John Silver") and "Captain Flint/Thomas Hamilton" ("Captain Flint | James McGraw/Thomas Hamilton"). 

```{r shipsHistogram, message = FALSE, fig.width=8, fig.height=6}

topList <- 8
topRelationshipsTable<- data.frame('Relationship' = names(summary(as.factor(unlist(relationships)))[1:topList]),
                          'Number of Stories' = summary(as.factor(unlist(relationships)))[1:topList])
row.names(topRelationshipsTable) <- c()
topRelationshipsTable <- topRelationshipsTable[order(topRelationshipsTable$Number.of.Stories, decreasing = TRUE),]
topRelationshipsTable$Relationship <- factor(as.character(topRelationshipsTable$Relationship), levels=as.character(topRelationshipsTable$Relationship))

relationshipsStats <- data.frame(Date = stats$Date,
                                 relationship1 = rep(0, length(stats$Date)),
                                 relationship2 = rep(0, length(stats$Date)),
                                 relationship3 = rep(0, length(stats$Date)),
                                 relationship4 = rep(0, length(stats$Date)),
                                 relationship5 = rep(0, length(stats$Date)),
                                 relationship6 = rep(0, length(stats$Date)),
                                 relationship7 = rep(0, length(stats$Date)),
                                 relationship8 = rep(0, length(stats$Date)))
for (i in 1:topList){
  matchingVector <- lapply(relationships, match, table=as.character(topRelationshipsTable$Relationship[i]))
  matchingVector <- unlist(lapply(matchingVector, sum, na.rm=TRUE))
  relationshipsStats[i+1] <- matchingVector
}

#colnames(relationshipsStats)[2:9] <- gsub('/', '\\/', topRelationshipsTable$Relationship)

plotLegendRelationships <- ggplot(topRelationshipsTable, aes(x=Relationship, y=Number.of.Stories, fill=Relationship))+
  geom_col(alpha=0.7)+
  scale_fill_manual(values = custompalette)+
  theme_half_open() +
  background_grid() +
  labs(x="",y='Number of Stories')+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
plotLegendRelationships+theme(legend.position = 'none')
```

# Ship tags through time

Season 3 kickstarted the growth of popularity of "Captain Flint/John Silver", which continued through season 4 and after, however at around March 2019 alternative tag "Captain Flint | James McGraw/John Silver" becomes more popular. Season 4 sees the rapid growth of "Captain Flint/Thomas Hamilton", which similarly switches to alternative tag "Captain Flint/Thomas Hamilton" in March 2019. The shift in tagging most likely happened due to the efforts of Ao3 tag wranglers.

```{r shipTagsTime, message = FALSE, fig.width=12, fig.height=6}

plotRelationships <- ggplot() +
    geom_density(data = relationshipsStats[relationshipsStats$relationship1 > 0,], mapping=aes(x = Date), colour=custompalette[1])+
    geom_density(data = relationshipsStats[relationshipsStats$relationship2 > 0,], mapping=aes(x = Date), colour=custompalette[2])+
    geom_density(data = relationshipsStats[relationshipsStats$relationship3 > 0,], mapping=aes(x = Date), colour=custompalette[3])+
    geom_density(data = relationshipsStats[relationshipsStats$relationship4 > 0,], mapping=aes(x = Date), colour=custompalette[4])+
    geom_density(data = relationshipsStats[relationshipsStats$relationship5 > 0,], mapping=aes(x = Date), colour=custompalette[5])+
    geom_density(data = relationshipsStats[relationshipsStats$relationship6 > 0,], mapping=aes(x = Date), colour=custompalette[6])+
    geom_density(data = relationshipsStats[relationshipsStats$relationship7 > 0,], mapping=aes(x = Date), colour=custompalette[7])+
    geom_density(data = relationshipsStats[relationshipsStats$relationship8 > 0,], mapping=aes(x = Date), colour=custompalette[8])+
    geom_vline(xintercept=seasonsStart)+
    geom_vline(xintercept=seasonsEnd, linetype ="longdash")+
    scale_x_date(date_breaks="3 months")+
    scale_color_manual(values = custompalette) +
    theme_half_open() +
    background_grid() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

mylegend <- get_legend(plotLegendRelationships)

plot_grid(plotRelationships, mylegend,
          rel_widths = c(2,1), nrow=1)
#plotRelationships

#rm(seasons, plotDatesRatingDensity)
```

# Archive Warnings

Majority of works are tagged with "No Archive Warnings Apply", followed by a sizable fraction of "Creator Chose Not To Use Archive Warnings". It seems to be a common matter of confusion between the usage of those two warnings, so it's possible that a lot of "Creator Chose Not To Use Archive Warnings" are mistagged "No Archive Warnings Apply".

```{r warningBars, message = FALSE, fig.width=6, fig.height=6}

multipleWarningSummary <- data.frame(Warning = c("No Archive Warnings Apply",
                                                  "Graphic Depictions Of Violence",
                                                  "Major Character Death",
                                                  "Rape/Non-Con",
                                                  "Underage",
                                                  "Creator Chose Not To Use Archive Warnings"),
                              Number.of.Works = c(sum(grepl("No Archive Warnings Apply",warnings)),
                                                  sum(grepl("Graphic Depictions Of Violence",warnings)),
                                                  sum(grepl("Major Character Death",warnings)),
                                                  sum(grepl("Rape/Non-Con",warnings)),
                                                  sum(grepl("Underage",warnings)),
                                                  sum(grepl("Creator Chose Not To Use Archive Warnings",warnings))) )

multipleWarningSummary$Warning <- factor(multipleWarningSummary$Warning, levels = c("No Archive Warnings Apply",
                                                                                    "Graphic Depictions Of Violence",
                                                                                    "Major Character Death",
                                                                                    "Rape/Non-Con",
                                                                                    "Underage",
                                                                                    "Creator Chose Not To Use Archive Warnings"))

plotWarnings <- plot_col(multipleWarningSummary, 'Warning', 'Number.of.Works', 'right')
plotWarnings

rm(multipleWarningSummary, plotWarnings)

```

# Multiple Fandoms

Number of works tagged with more than 1 fandom is `r length(fandom[unlist(lapply(fandom, length)) > 1])`, but the number of works explicitly tagged as 'crossover' is just `r length(fandom[grep('crossover',freeform, ignore.case=TRUE)])`.

# Authors by Works

Top 30 of most prolific authors in the tag by the number of stories as of data collection date:

```{r authorsWorks, message = FALSE}
topList <- 30

AuthorTable <- data.frame('Author' = names(summary(as.factor(unlist(author)))[1:topList]),
                          'Number of Stories' = summary(as.factor(unlist(author)))[1:topList])
row.names(AuthorTable) <- c()

kable(AuthorTable,
      col.names = c('Author', 'Number of Stories'))

rm(AuthorTable)
```

Top place is occupied by orphan_account, which is an artifact of archive' works orphaning function.

# Authors by Words

Only `r sum(unlist(lapply(author, length))>1)` works have more than one author. In cases where works had more than one author, I assumed that each of them contributed an equal amounts of words.

Top 30 of most prolific authors in the tag by the number of words written as of data collection date:

```{r authorsWords, message = FALSE}

wordsByAuthor <- c()

for (i in 1:length(words)){
  if (length(author[[i]]) > 1) {
    wordsByAuthor <- c(wordsByAuthor, rep(words[[i]]/length(author[[5]]), length(author[[i]]) ) )
  } else {
    wordsByAuthor <- c(wordsByAuthor, words[[i]])
  }
}

AuthorWordsTable <- data.frame('Author' = as.factor(unlist(author)),
                               'Words' = wordsByAuthor)

AuthorWordsSummary <- ddply(AuthorWordsTable, .(Author), 
                            summarize, 
                            Total.Words = sum(Words))
AuthorWordsSummary <- AuthorWordsSummary[order(AuthorWordsSummary$Total.Words, decreasing = TRUE),]
row.names(AuthorWordsSummary) <- c()

topList <- 30

kable(AuthorWordsSummary[1:topList,],
      col.names = c('Author', 'Total Words'))

rm(wordsByAuthor, i, AuthorWordsTable, AuthorWordsSummary)
```

Interestingly, orphan_account made it to the top by the number of words written as well.

# Characters

Top 30 of the most popular characters:

```{r characters, message = FALSE}
topList <- 30
CharacterTable<- data.frame('Character' = names(summary(as.factor(unlist(character)))[1:topList]),
                          'Number of Stories' = summary(as.factor(unlist(character)))[1:topList])
row.names(CharacterTable) <- c()

kable(CharacterTable,
      col.names = c('Character', 'Number of Stories'))

rm(CharacterTable)
```

# Relationships

Top 30 of the most popular relationships:

I don't have access to Ao3's system of synonymous tags, so by virtue of text processing some relationship tags here are repeated.

Overwhelmingly, "Korra/Asami Sato"/"Korrasami"/"Korra/Asami" is the most popular relationship in LOK, contributing to popularity of "F/F" category. They are followed by "Korra/Mako (Avatar)", and "Bolin/Opal (Avatar)".

```{r relationships, message = FALSE}
topList <- 30
RelationshipsTable<- data.frame('Relationship' = names(summary(as.factor(unlist(relationships)))[1:topList]),
                          'Number of Stories' = summary(as.factor(unlist(relationships)))[1:topList])
row.names(RelationshipsTable) <- c()

kable(RelationshipsTable,
      col.names = c('Relationship', 'Number of Stories'))

rm(RelationshipsTable)
```

# Freeform tags

Top 30 of the most popular freeform tags:

```{r freeform, message = FALSE}
topList <- 30
FreeformTable<- data.frame('Freeform' = names(summary(as.factor(unlist(freeform)))[1:topList]),
                          'Number of Stories' = summary(as.factor(unlist(freeform)))[1:topList])
row.names(FreeformTable) <- c()

kable(FreeformTable,
      col.names = c('Freeform Tag', 'Number of Stories'))

rm(FreeformTable)
```

# Languages

Unsurprisingly, most works are written in English. Apologies for U+. kable package for whatever reason murders unicode characters. The two languages in question are Russian (Русский) and Chinese (中文).

```{r languages, message = FALSE}
#topList <- 30

languagesList <- summary(as.factor(unlist(language)))

LanguageTable <- data.frame('Language' = names(languagesList),
                            'Number of Stories' = languagesList )
LanguageTable <- LanguageTable[order(LanguageTable$Number.of.Stories, decreasing=TRUE),]
row.names(LanguageTable) <- c()

kable(LanguageTable,
      col.names = c('Language', 'Number of Stories'))

#languagesList

#rm(LanguageTable)
```

# Other links

[Ao3 data analysis for The Dragon Prince (Cartoon)](https://darthaline.github.io/Ao3SearchAnalysis/fandoms/TDP/TDP_processing_notebook.nb.html)

[Ao3 data analysis for Avatar: Legend of Korra](https://darthaline.github.io/Ao3SearchAnalysis/fandoms/LOK/LOK_processing_notebook.nb.html)

[Ao3 data analysis for Avatar: The Last Airbender](https://darthaline.github.io/Ao3SearchAnalysis/fandoms/ATLA/ATLA_processing_notebook.nb.html)

Ao3 data analysis for Black Sails

If you enjoyed my analysis, please, consider [buying me a coffee](https://ko-fi.com/D1D8RIG5) or some other beverage.