About
Sys.setenv(LANG = "en")
#library("rstudioapi") #to grab local position of the script
#setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
knitr::opts_knit$set(root.dir = '.')
#library("rvest") # to handle html stuff
library(lubridate) # to handle dates
library(ggplot2) # for plotting
library(cowplot) # for plotting
library(RColorBrewer) # for choosing colors
custompalette <- brewer.pal(n=8, name = 'Dark2')
library(knitr) # for tables
library(kableExtra) # for tables
library(lubridate) # for dates
library(plyr) # ddply, to summarize number of words by author
load('TDP_worksData.RData')
This is a document detailing analysis of The Dragon Prince (Cartoon) Ao3 tag data, collected on the 7 Aug 2020. I haven’t figured out a way to get my scrapper to log in into Ao3 (yet? rvest seems to have some trouble with page redirects), so results here are based on the works visible without authentication, which likely filters out preferentially explicit/problemantic works from the selection.
plot_bar <- function (data, columnX, legendPosition) {
ggplot(data, aes_string(x = columnX)) +
geom_bar(alpha=1)+
theme_half_open() +
background_grid() +
theme(legend.title=element_blank(),
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
labs(y="Number of works")
}
plot_bar_color <- function (data, columnX, colColor, legendPosition) {
ggplot(data, aes_string(x = columnX, fill=colColor)) +
geom_bar(alpha=0.7)+
scale_fill_manual(values = custompalette) +
theme_half_open() +
background_grid() +
theme(legend.title=element_blank(),
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
labs(y="Number of works")
}
plot_col <- function (data, columnX, columnY, legendPosition) {
ggplot(data, aes_string(x = columnX, y = columnY)) +
geom_col(alpha=1)+
theme_half_open() +
background_grid() +
theme(legend.title=element_blank(),
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
labs(y=gsub('\\.', ' ', columnY))
}
plot_col_color <- function (data, columnX, columnY, colColor, legendPosition) {
ggplot(data, aes_string(x = columnX, y = columnY, fill=colColor)) +
geom_col(alpha=0.7)+
scale_fill_manual(values = custompalette) +
theme_half_open() +
background_grid() +
theme(legend.title=element_blank(),
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
labs(y=gsub('\\.', ' ', columnY))
}
plot_percentiles <- function (data, columnX, columnY, legendPosition) {
ggplot(data, aes_string(x = columnX, y = columnY)) +
geom_point(alpha=0.3)+
scale_y_log10(breaks = 10^c(0:15))+
scale_x_continuous(breaks = c(0, 25, 50, 75, 100))+ #scale_x_continuous(breaks = c(0:10)*10)+
theme_half_open() +
background_grid() +
theme(legend.title=element_blank())+
labs(x=gsub('\\.', ' ', columnX))
}
plot_density <- function (data, column, color_column, legendPosition) {
ggplot(data, aes_string(x = column, col=color_column)) +
geom_density(alpha = 0.1)+
scale_x_log10()+
theme_half_open() +
background_grid() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
legend.position = legendPosition)
}
plot_points <- function (data, columnX, columnY, color_column, legendPosition) {
ggplot(data, aes_string(x = columnX, y = columnY, col=color_column)) +
geom_point(alpha=0.3)+
scale_x_log10()+
scale_y_log10()+
facet_wrap(color_column)+
theme_half_open() +
background_grid() +
theme(legend.title=element_blank(),
axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
}
#title <- lapply(worksData, function(x) {x$Title})
author <- lapply(worksData, function(x) {x$Author})
fandom <- lapply(worksData, function(x) {x$Fandom})
rating <- lapply(worksData, function(x) {x$Rating})
warnings <- lapply(worksData, function(x) {x$Warnings})
category <- lapply(worksData, function(x) {x$Category})
WIP <- lapply(worksData, function(x) {x$WIP})
date <-lapply(worksData, function(x) {x$Date})
relationships <-lapply(worksData, function(x) {x$Relationships})
character <-lapply(worksData, function(x) {x$Character})
freeform <-lapply(worksData, function(x) {x$Freeform})
language <-lapply(worksData, function(x) {x$Language})
words <-lapply(worksData, function(x) {x$Words})
kudos <-lapply(worksData, function(x) {x$Kudos})
comments <-lapply(worksData, function(x) {x$Comments})
bookmarks<-lapply(worksData, function(x) {x$Bookmarks})
hits <-lapply(worksData, function(x) {x$Hits})
stats <- data.frame(Words = unlist(words, recursive = FALSE),
Comments= as.numeric(as.character(comments)),
Kudos = as.numeric(as.character(kudos)),
Bookmarks = as.numeric(as.character(bookmarks)),
Hits = as.numeric(as.character(hits)),
WIP = unlist(WIP, recursive = FALSE),
Rating = unlist(rating, recursive = FALSE),
Date = do.call("c", date))
stats$Rating <- factor(stats$Rating, levels = c("Not Rated", "General Audiences", "Teen And Up Audiences", "Mature", "Explicit"))
total <- 1000
percentile <- c(1:total)
percentileData <- data.frame(Works.Percentile = 100*(total - percentile)/total,
Words = unlist(lapply(percentile/total, quantile, x = unlist(words))) + 1,
Hits = unlist(lapply(percentile/total, quantile, x = unlist(hits))) + 1,
Kudos = unlist(lapply(percentile/total, quantile, x = unlist(kudos))) + 1,
Comments = unlist(lapply(percentile/total, quantile, x = unlist(comments))) + 1,
Bookmarks = unlist(lapply(percentile/total, quantile, x = unlist(bookmarks))) + 1 )
rm(rating, kudos, comments, bookmarks, hits)
Timeline
Vertical lines on the graph indicate season release dates according to Wiki article. As expected, after each new season, there’s a peak of activity which fades in about 2 months. Curiously, a few works were posted before official season 1 release. This may be attributed to the series trailer drop in July 2018 at the San Diego Comic-Con.
#data$Timestamp <- parse_date_time2(as.character(data$Timestamp), orders = "%d/%m/%Y %H:%M:%S")
#data$day <- as.Date(data$Timestamp)
seasons <- c("2018-09-14", "2019-02-15", "2019-11-22")
seasons <- as.Date(seasons)
dates <- data.frame(date = do.call("c", date),
WIP = unlist(WIP))
plotDatesDensityTotal <- ggplot(stats, aes(x = Date)) +
geom_density(alpha = 0.1)+
geom_vline(xintercept=seasons)+
scale_x_date(date_breaks="2 months")+
theme_half_open() +
background_grid() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
legend.position = 'right')
plotDatesDensityTotal

rm(plotDatesDensityTotal)
It’s important to note, that I collect data from the Ao3 search page (rather than works pages, as it’s less disruptive), so I don’t have access to initial postage dates, only the latest updates. This means that the upward trend in works over time can be an artifact of series getting more popular, but also could be attributed to multichapter works drifting further in time due to updates.
If we plot Complete Works and Works in Progress separately, we still see an upward trend in both, but the slope characterising the growth of Work In progress peaks seems steeper than for Complete Works, which to me indicates at least partial effect of the multichapter drift.
plotDatesDensity <- ggplot(stats, aes(x = Date, col=WIP)) +
geom_density(alpha = 0.1)+
geom_vline(xintercept=seasons)+
scale_x_date(date_breaks="2 months")+
theme_half_open() +
background_grid() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
legend.position = 'right')
plotDatesDensity

rm(plotDatesDensity)
Engagement percentiles
Small plotting cheat: all the numbers on the Y axis are increased by 1 to include the case of 0 into the plot (otherwise excluded because of log scale).
- About 75% of works have more than a 1000 words, but only about 10% have more than 10000 words.
- Only about 30% of works have over a 1000 hits.
- Only about 25% of works have more than a 100 kudos.
- Only about 40% of works get more than 10 comments.
- Approximately 10% of works have no comments (tail end).
- Only approximately 25% of works get more than 10 bookmarks.
- Approximately 20% of works have no bookmarks (tail end).
wordsPercentiles <- plot_percentiles(percentileData, 'Works.Percentile', 'Words', 'right')
hitsPercentiles <- plot_percentiles(percentileData, 'Works.Percentile', 'Hits', 'right')
kudosPercentiles <- plot_percentiles(percentileData, 'Works.Percentile', 'Kudos', 'right')
commentsPercentiles <- plot_percentiles(percentileData, 'Works.Percentile', 'Comments', 'right')
bookmarksPercentiles <- plot_percentiles(percentileData, 'Works.Percentile', 'Bookmarks', 'right')
plot_grid(wordsPercentiles + theme(legend.position="none"),
hitsPercentiles + theme(legend.position="none"),
kudosPercentiles + theme(legend.position="none"),
commentsPercentiles + theme(legend.position="none"),
bookmarksPercentiles + theme(legend.position="none"),
get_legend(kudosPercentiles +
theme(legend.title=element_blank())))

rm(total, percentile, percentileData, wordsPercentiles, hitsPercentiles, kudosPercentiles, commentsPercentiles, bookmarksPercentiles)
Complete Work vs Work in Progress distributions
- There are approximately 4 times as many Complete Works as there are Works in Progress.
- Works in Progress are approximately 4 times longer than Complete ones.
- Works in Progress get approximately twice as many hits as Complete ones.
- Works in Progress get approximately 25% more kudos than Complete ones.
- Works in Progress get approximately 3 times as many comments as Complete ones (however there’s no way to filter out author’s comments in the search. selection, so this statistic should be taken with a grain of salt).
- Works in Progress get approximately twice as many bookmarks as Complete ones.
statsWIP <- stats
statsWIP$Divisor <- unlist(lapply(statsWIP$WIP, function(x) summary(statsWIP$WIP)[names(summary(statsWIP$WIP)) == x]))
statsWIP$Words.per.Work <- statsWIP$Words/statsWIP$Divisor
statsWIP$Hits.per.Work <- statsWIP$Hits/statsWIP$Divisor
statsWIP$Kudos.per.Work <- statsWIP$Kudos/statsWIP$Divisor
statsWIP$Comments.per.Work <- statsWIP$Comments/statsWIP$Divisor
statsWIP$Bookmarks.per.Work <- statsWIP$Bookmarks/statsWIP$Divisor
barWorksWIP <- plot_bar(statsWIP, 'WIP', 'right')
barWordsWIP <- plot_col(statsWIP, 'WIP', 'Words.per.Work', 'right')
barHitsWIP <- plot_col(statsWIP, 'WIP', 'Hits.per.Work', 'right')
barKudosWIP <- plot_col(statsWIP, 'WIP', 'Kudos.per.Work', 'right')
barCommentsWIP <- plot_col(statsWIP, 'WIP', 'Comments.per.Work', 'right')
barBookmarksWIP <- plot_col(statsWIP, 'WIP', 'Bookmarks.per.Work', 'right')
# plot_grid(plot_grid( barWorksWIP + theme(legend.position="none"),
# barWordsWIP + theme(legend.position="none"),
# barHitsWIP + theme(legend.position="none"),
# barKudosWIP + theme(legend.position="none"),
# barCommentsWIP + theme(legend.position="none"),
# barBookmarksWIP + theme(legend.position="none"),
# align = 'hv'),
# get_legend(barWorksWIP + theme(legend.title=element_blank())),
# rel_widths = c(4,1),
# align = 'hv')
plot_grid( barWorksWIP + theme(legend.position="none"),
barWordsWIP + theme(legend.position="none"),
barHitsWIP + theme(legend.position="none"),
barKudosWIP + theme(legend.position="none"),
barCommentsWIP + theme(legend.position="none"),
barBookmarksWIP + theme(legend.position="none"),
align = 'hv')

rm(statsWIP, barWorksWIP, barWordsWIP, barHitsWIP, barKudosWIP, barCommentsWIP, barBookmarksWIP)
wordsDensityWIP <- plot_density(stats, 'Words', 'WIP', 'right')
hitsDensityWIP <- plot_density(stats, 'Hits', 'WIP', 'right')
kudosDensityWIP <- plot_density(stats, 'Kudos', 'WIP', 'right')
commentsDensityWIP <- plot_density(stats, 'Comments', 'WIP', 'right')
bookmarksDensityWIP <- plot_density(stats, 'Bookmarks', 'WIP', 'right')
plot_grid(wordsDensityWIP + theme(legend.position="none"),
hitsDensityWIP + theme(legend.position="none"),
kudosDensityWIP + theme(legend.position="none"),
commentsDensityWIP + theme(legend.position="none"),
bookmarksDensityWIP + theme(legend.position="none"),
get_legend(kudosDensityWIP +
theme(legend.title=element_blank())))
rm(wordsDensityWIP, hitsDensityWIP, kudosDensityWIP, commentsDensityWIP, bookmarksDensityWIP)
wordsHitsWIP <- plot_points(stats, 'Words', 'Hits', 'WIP', 'right')
wordsHitsWIP
wordsKudosWIP <- plot_points(stats, 'Words', 'Kudos', 'WIP', 'right')
wordsKudosWIP
wordsCommentsWIP <- plot_points(stats, 'Words', 'Comments', 'WIP', 'right')
wordsCommentsWIP
wordsBookmarksWIP <- plot_points(stats, 'Words', 'Bookmarks', 'WIP', 'right')
wordsBookmarksWIP
rm(wordsHitsWIP, wordsKudosWIP, wordsCommentsWIP, wordsBookmarksWIP)
Rating distributions
- Works rated G make up the most numerous category (~1300), but they are on average the shortest (~2500 words) and get fewest hits(>1000), and fewest comments (~10).
- Works rated T follow in numbers (~900), but are significantly longer (~9000 words).
- Works rated M make for the smallest category (tied with Not Rated, at ~ 200), but are the longest (~13k words).
- Works rated E are few (~350), longer than G but shorter than T rated works (~6000 words), and get the most hits, but fewer kudos and significantly less comments than T and M rated works.
statsRating <- stats
statsRating$Divisor <- unlist(lapply(statsRating$Rating, function(x) summary(statsRating$Rating)[names(summary(statsRating$Rating)) == x]))
statsRating$Words.per.Work <- statsRating$Words/statsRating$Divisor
statsRating$Hits.per.Work <- statsRating$Hits/statsRating$Divisor
statsRating$Kudos.per.Work <- statsRating$Kudos/statsRating$Divisor
statsRating$Comments.per.Work <- statsRating$Comments/statsRating$Divisor
statsRating$Bookmarks.per.Work <- statsRating$Bookmarks/statsRating$Divisor
barWorksRating <- plot_bar(statsRating, 'Rating', 'right')
barWordsRating <- plot_col(statsRating, 'Rating', 'Words.per.Work', 'right')
barHitsRating <- plot_col(statsRating, 'Rating', 'Hits.per.Work', 'right')
barKudosRating <- plot_col(statsRating, 'Rating', 'Kudos.per.Work', 'right')
barCommentsRating <- plot_col(statsRating, 'Rating', 'Comments.per.Work', 'right')
barBookmarksRating <- plot_col(statsRating, 'Rating', 'Bookmarks.per.Work', 'right')
plot_grid( barWorksRating + theme(legend.position="none"),
barWordsRating + theme(legend.position="none"),
barHitsRating + theme(legend.position="none"),
barKudosRating + theme(legend.position="none"),
barCommentsRating + theme(legend.position="none"),
barBookmarksRating + theme(legend.position="none"),
align = 'hv')

rm(statsRating, barWorksRating, barWordsRating, barHitsRating, barKudosRating, barCommentsRating, barBookmarksRating)
wordsDensityRating <- plot_density(stats, 'Words', 'Rating', 'right')
hitsDensityRating <- plot_density(stats, 'Hits', 'Rating', 'right')
kudosDensityRating <- plot_density(stats, 'Kudos', 'Rating', 'right')
commentsDensityRating <- plot_density(stats, 'Comments', 'Rating', 'right')
bookmarksDensityRating <- plot_density(stats, 'Bookmarks', 'Rating', 'right')
plot_grid(wordsDensityRating + theme(legend.position="none"),
hitsDensityRating + theme(legend.position="none"),
kudosDensityRating + theme(legend.position="none"),
commentsDensityRating + theme(legend.position="none"),
bookmarksDensityRating + theme(legend.position="none"),
get_legend(kudosDensityRating +
theme(legend.title=element_blank())))
rm(wordsDensityRating, hitsDensityRating, kudosDensityRating, commentsDensityRating, bookmarksDensityRating)
wordsHitsRating <- plot_points(stats, 'Words', 'Hits', 'Rating', 'right')
wordsHitsRating
wordsKudosRating <- plot_points(stats, 'Words', 'Kudos', 'Rating', 'right')
wordsKudosRating
wordsCommentsRating <- plot_points(stats, 'Words', 'Comments', 'Rating', 'right')
wordsCommentsRating
wordsBookmarksRating <- plot_points(stats, 'Words', 'Bookmarks', 'Rating', 'right')
wordsBookmarksRating
rm(wordsHitsRating, wordsKudosRating, wordsCommentsRating, wordsBookmarksRating)
Categories
There are 2604 works tagged with a single category, and 444 tagged with 2 or more (up until all 6).
‘F/M’ is the most popular category, followed by ‘M/M’, ‘Gen’, and ‘F/F’.
Multiple category fics strongly contribute towards ‘F/M’ count, then to ‘M/M’, ‘Gen’, and ‘F/F’, and only marginally to ‘Multi’ and ‘Other’.
singleCategorySummary <- summary(as.factor(unlist(category[unlist(lapply(category, function(x) length(x))) == 1])))
singleCategorySummary <- data.frame(Category = names(singleCategorySummary),
Number.of.Works = singleCategorySummary)
singleCategorySummary$Split <- "Single category"
multipleCategorySummary <- data.frame(Category = c('Gen', 'F/F', 'F/M', 'M/M', 'Multi', 'Other', 'No category'),
Number.of.Works = c(sum(grepl('Gen',category)),
sum(grepl('F/F',category)),
sum(grepl('F/M',category)),
sum(grepl('M/M',category)),
sum(grepl('Multi',category)),
sum(grepl('Other',category)),
sum(grepl('No category',category))) )
multipleCategorySummary$Split <- "All works"
categorySummary <- rbind(singleCategorySummary, multipleCategorySummary)
categorySummary$Category <- factor(categorySummary$Category, levels = c('Gen', 'F/F', 'F/M', 'M/M', 'Multi', 'Other', 'No category'))
categorySummary$Split <- factor(categorySummary$Split, levels = c("Single category", "All works"))
plotCategories <- ggplot(categorySummary, aes(x = Category, y = Number.of.Works)) +
geom_col(alpha=1)+
theme_half_open() +
background_grid() +
facet_wrap(.~Split) +
theme(legend.title=element_blank(),
axis.title.x = element_blank(),
axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
labs(y="Number of Works")
plotCategories

rm(singleCategorySummary, multipleCategorySummary, categorySummary, plotCategories)
Engagement by a single category
For simplicity I’m only looking at works tagged with a single category here.
“Multi” seems to have most words, despite being a rather small category, and collects quite a bit of Hits and Comments. It’s possible that a number of those works are collections of stories for many fandoms, which amplifies the number of Hits and Comments, but that requires further investigation.
statsCategory <- stats[unlist(lapply(category, function(x) length(x))) == 1,]
statsCategory$Category <- as.factor(unlist(category[unlist(lapply(category, function(x) length(x))) == 1]))
statsCategory$Category <- factor(statsCategory$Category, levels = c('Gen', 'F/F', 'F/M', 'M/M', 'Multi', 'Other', 'No category'))
statsCategory$Divisor <- unlist(lapply(statsCategory$Category, function(x) summary(statsCategory$Category)[names(summary(statsCategory$Category)) == x]))
statsCategory$Words.per.Work <- statsCategory$Words/statsCategory$Divisor
statsCategory$Hits.per.Work <- statsCategory$Hits/statsCategory$Divisor
statsCategory$Kudos.per.Work <- statsCategory$Kudos/statsCategory$Divisor
statsCategory$Comments.per.Work <- statsCategory$Comments/statsCategory$Divisor
statsCategory$Bookmarks.per.Work <- statsCategory$Bookmarks/statsCategory$Divisor
statsCategory$Works.Percent <- 1/statsCategory$Divisor
barWorksCategory <- plot_bar_color(statsCategory, 'Category', 'Rating', 'right')
barWordsCategory <- plot_col_color(statsCategory, 'Category', 'Words.per.Work', 'Rating', 'right')
barHitsCategory <- plot_col_color(statsCategory, 'Category', 'Hits.per.Work', 'Rating', 'right')
barKudosCategory <- plot_col_color(statsCategory, 'Category', 'Kudos.per.Work', 'Rating', 'right')
barCommentsCategory <- plot_col_color(statsCategory, 'Category', 'Comments.per.Work', 'Rating', 'right')
barBookmarksCategory <- plot_col_color(statsCategory, 'Category', 'Bookmarks.per.Work','Rating', 'right')
plot_grid(plot_grid( barWorksCategory + theme(legend.position="none"),
barWordsCategory + theme(legend.position="none"),
barHitsCategory + theme(legend.position="none"),
barKudosCategory + theme(legend.position="none"),
barCommentsCategory + theme(legend.position="none"),
barBookmarksCategory + theme(legend.position="none"),
align = 'hv'),
get_legend(barWorksCategory + theme(legend.title=element_blank())),
rel_widths = c(4,1))

Ratings percentages by a single category
Out of the 3 main shipping categories in absolute numbers “M/M” has most E rated works, and “F/F” has the least, but in percentages of total works “F/F” and “F/M” are distributed almost identically, while “M/M” skews more towards M and E rated works.
plotWorksCategoryNormalized <- plot_col_color(statsCategory, 'Rating', 'Works.Percent', 'Rating', 'none')+
scale_y_continuous(labels=scales::percent)+
facet_wrap(.~Category)
plotWorksCategoryNormalized

rm(barWorksCategory, barWordsCategory, barHitsCategory, barKudosCategory, barCommentsCategory, barBookmarksCategory, plotWorksCategoryNormalized)
Single Category through time
Interestingly, season 1 sees a peak of ‘Gen’ category. Season 2 gives higher rise to ‘M/M’ (possibly related to Aaravos reveal and ‘Aaravos/Viren (The Dragon Prince)’ shipping) and ‘F/M’ (the rise of ‘Callum/Rayla (The Dragon Prince)’?), and season 3 is followed by a high rise of ‘F/F’ (following the development of ‘Amaya/Janai (The Dragon Prince)’ relationship) and a modest ‘M/M’ peak (‘Ethari/Runaan (The Dragon Prince)’ due to Ethari finally getting official name?).
plotDatesCategoryDensity <- ggplot(statsCategory, aes(x = Date, col=Category)) +
geom_density(alpha = 0.1)+
geom_vline(xintercept=seasons)+
scale_x_date(date_breaks="2 months")+
scale_color_manual(values = custompalette) +
theme_half_open() +
background_grid() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
legend.position = 'right')
plotDatesCategoryDensity

rm(plotDatesCategoryDensity)
Ship tags through time
“Amaya/Janai (The Dragon Prince)” sharply took of in popularity after season 3. Ethari’s name hasn’t been revealed until season 3, so authors were using “Runaan/Tinker | Necklace Elf (The Dragon Prince)” instead of “Ethari/Runaan (The Dragon Prince)”, but Ao3 would consider them synonymous. Romance tag “Callum/Rayla (The Dragon Prince)” is significantly more popular than platonic/frienship tag “Callum & Rayla (The Dragon Prince)”, however it’s worth noting that out of 230 stories tagged with a friendship/platonic one 178 are tagged with both, possibly making specifically friendship/platonic content more difficult to find.
plotRelationships <- ggplot() +
geom_density(data = relationshipsStats[relationshipsStats$relationship1 > 0,], mapping=aes(x = Date), colour=custompalette[1])+
geom_density(data = relationshipsStats[relationshipsStats$relationship2 > 0,], mapping=aes(x = Date), colour=custompalette[2])+
geom_density(data = relationshipsStats[relationshipsStats$relationship3 > 0,], mapping=aes(x = Date), colour=custompalette[3])+
geom_density(data = relationshipsStats[relationshipsStats$relationship4 > 0,], mapping=aes(x = Date), colour=custompalette[4])+
geom_density(data = relationshipsStats[relationshipsStats$relationship5 > 0,], mapping=aes(x = Date), colour=custompalette[5])+
geom_density(data = relationshipsStats[relationshipsStats$relationship6 > 0,], mapping=aes(x = Date), colour=custompalette[6])+
geom_density(data = relationshipsStats[relationshipsStats$relationship7 > 0,], mapping=aes(x = Date), colour=custompalette[7])+
geom_density(data = relationshipsStats[relationshipsStats$relationship8 > 0,], mapping=aes(x = Date), colour=custompalette[8])+
geom_vline(xintercept=seasons)+
scale_x_date(date_breaks="2 months")+
scale_color_manual(values = custompalette) +
theme_half_open() +
background_grid() +
theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
mylegend <- get_legend(plotLegendRelationships)
plot_grid(plotRelationships, mylegend,
rel_widths = c(2,1), nrow=1)

#plotRelationships
#rm(seasons, plotDatesRatingDensity)
Archive Warnings
Majority of works are tagged with “No Archive Warnings Apply”, followed by a sizable fraction of “Creator Chose Not To Use Archive Warnings”. It seems to be a common matter of confusion between the usage of those two warnings, so it’s possible that a lot of “Creator Chose Not To Use Archive Warnings” are mistagged “No Archive Warnings Apply”.
multipleWarningSummary <- data.frame(Warning = c("No Archive Warnings Apply",
"Graphic Depictions Of Violence",
"Major Character Death",
"Rape/Non-Con",
"Underage",
"Creator Chose Not To Use Archive Warnings"),
Number.of.Works = c(sum(grepl("No Archive Warnings Apply",warnings)),
sum(grepl("Graphic Depictions Of Violence",warnings)),
sum(grepl("Major Character Death",warnings)),
sum(grepl("Rape/Non-Con",warnings)),
sum(grepl("Underage",warnings)),
sum(grepl("Creator Chose Not To Use Archive Warnings",warnings))) )
multipleWarningSummary$Warning <- factor(multipleWarningSummary$Warning, levels = c("No Archive Warnings Apply",
"Graphic Depictions Of Violence",
"Major Character Death",
"Rape/Non-Con",
"Underage",
"Creator Chose Not To Use Archive Warnings"))
plotWarnings <- plot_col(multipleWarningSummary, 'Warning', 'Number.of.Works', 'right')
plotWarnings

rm(multipleWarningSummary, plotWarnings)
Multiple Fandoms
multiFandoms <- fandom[category == "Multi"]
multiFandomsAll <- fandom[grepl('Multi', category)]
severalFandoms <- fandom[unlist(lapply(fandom, length)) > 1]
crossovers <- fandom[grep('crossover',freeform, ignore.case=TRUE)]
Total number of works tagged only as ‘Multi’ is 63, but only 7 are tagged with more than one fandom. Among these, median number of fandoms tagged is 6, and at least one work is tagged with 15.
Number of works tagged with ‘Multi’ and/or other categories is 139, but only 21 are tagged with more than one fandom. Among these, median number of fandoms tagged is 14, and at least one work is tagged with 56.
Number of works tagged with more than 1 fandom is 133, however in some cases fandom tags used by the author are synonymous with The Dragon Prince (Cartoon), for example “The Dragon Prince”, “rayllum - Fandom”, “rayla x callum”, “TDP - Fandom”, “callum x rayla”.
Number of works explicitly tagged as ‘crossover’ is lower: 30. Out of multiple fandom tag works a significant amount are tagged with 2: 80, which, upon inspection, don’t seem overwhelmingly synonymous, so perhaps some authors simply don’t tag for crossovers.
Authors by Works
Top 30 of most prolific authors in the tag by the number of stories as of data collection date:
topList <- 30
AuthorTable <- data.frame('Author' = names(summary(as.factor(unlist(author)))[1:topList]),
'Number of Stories' = summary(as.factor(unlist(author)))[1:topList])
row.names(AuthorTable) <- c()
kable(AuthorTable,
col.names = c('Author', 'Number of Stories'))
Author
|
Number of Stories
|
planetundersiege
|
244
|
6Husbandos
|
40
|
Lady_Talla_Doe
|
37
|
orphan_account
|
33
|
brightsmoon
|
31
|
HootHalycon
|
30
|
beautifulterriblequeen
|
26
|
Lonespark_the_friendly_kraken
|
26
|
Symphoenae
|
26
|
propheticfire
|
23
|
Jelly
|
22
|
DelicateDragons
|
21
|
wordswithdragons
|
21
|
Porg_Master
|
19
|
Midnightdragon2
|
18
|
CuddlyCookie1360
|
17
|
UnsubstantiatedAssertion
|
17
|
AWillfullDroll
|
16
|
zuppi
|
16
|
Nightworldlove
|
15
|
im_the_king_of_the_ocean
|
14
|
Khaleesi_0f_Trolls
|
14
|
Mydarlingwriter
|
14
|
poetroe
|
14
|
spiritypowers
|
14
|
Aaravosa (Lokiiwood)
|
13
|
Anima_princess_1
|
13
|
Anonymous
|
13
|
Erratus
|
13
|
his_valentine
|
13
|
rm(AuthorTable)
Authors by Words
Only 48 works have more than one author. In cases where works had more than one author, I assumed that each of them contributed an equal amounts of words.
Top 30 of most prolific authors in the tag by the number of words written as of data collection date:
wordsByAuthor <- c()
for (i in 1:length(words)){
if (length(author[[i]]) > 1) {
wordsByAuthor <- c(wordsByAuthor, rep(words[[i]]/length(author[[5]]), length(author[[i]]) ) )
} else {
wordsByAuthor <- c(wordsByAuthor, words[[i]])
}
}
AuthorWordsTable <- data.frame('Author' = as.factor(unlist(author)),
'Words' = wordsByAuthor)
AuthorWordsSummary <- ddply(AuthorWordsTable, .(Author),
summarize,
Total.Words = sum(Words))
AuthorWordsSummary <- AuthorWordsSummary[order(AuthorWordsSummary$Total.Words, decreasing = TRUE),]
row.names(AuthorWordsSummary) <- c()
topList <- 30
kable(AuthorWordsSummary[1:topList,],
col.names = c('Author', 'Total Words'))
Author
|
Total Words
|
Decorated
|
500506.5
|
beautifulterriblequeen
|
313169.0
|
spontaneite
|
294914.0
|
Lodke
|
248405.0
|
Jelly
|
225759.0
|
DeeTheTeaDrinkingDragon
|
217848.2
|
NumptyPylon
|
184380.0
|
nonameforhire
|
175498.0
|
Khaleesi_0f_Trolls
|
166320.0
|
genericfanatic
|
162468.5
|
enbyred
|
158300.0
|
MagiesHeartLove
|
156268.0
|
prolixdreams
|
156264.0
|
The_Eternal_Winter
|
154539.0
|
wordswithdragons
|
144711.5
|
Symphoenae
|
144698.2
|
Kuno
|
144402.0
|
AWillfullDroll
|
139935.0
|
CuddlyCookie1360
|
137388.0
|
nautiscarader
|
134518.0
|
DelphinusDelphis
|
134126.0
|
Erratus
|
127890.0
|
Captain_Azoren
|
126415.0
|
Ocaj
|
125066.0
|
iwillhaveamoonbase
|
122299.0
|
DelicateDragons
|
118327.0
|
assassiinikissa
|
114339.0
|
BlehBlahBluh
|
113585.0
|
zuppi
|
112691.0
|
RosettaStarlight
|
109028.0
|
rm(wordsByAuthor, i, AuthorWordsTable, AuthorWordsSummary)
Characters
Top 30 of the most popular characters:
topList <- 30
CharacterTable<- data.frame('Character' = names(summary(as.factor(unlist(character)))[1:topList]),
'Number of Stories' = summary(as.factor(unlist(character)))[1:topList])
row.names(CharacterTable) <- c()
kable(CharacterTable,
col.names = c('Character', 'Number of Stories'))
Character
|
Number of Stories
|
Rayla (The Dragon Prince)
|
1455
|
Callum (The Dragon Prince)
|
1434
|
Viren (The Dragon Prince)
|
782
|
Ezran (The Dragon Prince)
|
735
|
Soren (The Dragon Prince)
|
687
|
Claudia (The Dragon Prince)
|
665
|
Runaan (The Dragon Prince)
|
601
|
Amaya (The Dragon Prince)
|
558
|
Aaravos (The Dragon Prince)
|
472
|
Harrow (The Dragon Prince)
|
399
|
Gren (The Dragon Prince)
|
340
|
Ethari (The Dragon Prince)
|
334
|
Janai (The Dragon Prince)
|
333
|
Azymondias (The Dragon Prince)
|
297
|
Sarai (The Dragon Prince)
|
179
|
Bait (The Dragon Prince)
|
160
|
Tinker | Necklace Elf (The Dragon Prince)
|
135
|
Corvus (The Dragon Prince)
|
121
|
Opeli (The Dragon Prince)
|
107
|
Original Characters
|
102
|
Marcos (The Dragon Prince)
|
85
|
Lain (The Dragon Prince)
|
75
|
Tiadrin (The Dragon Prince)
|
75
|
Original Female Character(s)
|
74
|
Kazi (The Dragon Prince)
|
71
|
Lujanne (The Dragon Prince)
|
70
|
Ibis (The Dragon Prince)
|
60
|
Reader
|
57
|
Ellis (The Dragon Prince)
|
53
|
Aanya (The Dragon Prince)
|
51
|
rm(CharacterTable)
Relationships
Top 30 of the most popular relationships:
I don’t have access to Ao3’s system of synonymous tags, so by virtue of text processing some relationship tags here are repeated.
Overwhelmingly, “Callum/Rayla (The Dragon Prince)” is the most popular relationship in TDP. They are followed by “Ethari/Runaan (The Dragon Prince)”, which is not immediately obvious due to common use of synonymous tags such as “Runaan/Tinker | Necklace Elf (The Dragon Prince)” and “Runaan/Ethari”. Third most popular relationship is “Amaya/Janai (The Dragon Prince)”.
topList <- 30
RelationshipsTable<- data.frame('Relationship' = names(summary(as.factor(unlist(relationships)))[1:topList]),
'Number of Stories' = summary(as.factor(unlist(relationships)))[1:topList])
row.names(RelationshipsTable) <- c()
kable(RelationshipsTable,
col.names = c('Relationship', 'Number of Stories'))
Relationship
|
Number of Stories
|
Callum/Rayla (The Dragon Prince)
|
1009
|
Amaya/Janai (The Dragon Prince)
|
273
|
Callum & Rayla (The Dragon Prince)
|
230
|
Ethari/Runaan (The Dragon Prince)
|
222
|
Aaravos/Viren (The Dragon Prince)
|
182
|
Runaan/Tinker | Necklace Elf (The Dragon Prince)
|
155
|
Harrow/Viren (The Dragon Prince)
|
119
|
Claudia/Rayla (The Dragon Prince)
|
113
|
Callum & Ezran & Rayla (The Dragon Prince)
|
100
|
Harrow/Sarai (The Dragon Prince)
|
87
|
Claudia & Soren (The Dragon Prince)
|
82
|
Callum & Ezran (The Dragon Prince)
|
77
|
Rayla & Runaan (The Dragon Prince)
|
67
|
Aaravos (The Dragon Prince)/Reader
|
55
|
Lain/Tiadrin (The Dragon Prince)
|
52
|
Aaravos & Viren (The Dragon Prince)
|
47
|
Amaya/Gren (The Dragon Prince)
|
47
|
Amaya & Gren (The Dragon Prince)
|
46
|
Callum & Soren (The Dragon Prince)
|
45
|
Callum & Claudia (The Dragon Prince)
|
40
|
Marcos/Soren (The Dragon Prince)
|
40
|
Ezran & Rayla (The Dragon Prince)
|
36
|
Rayllum - Relationship
|
32
|
Runaan/Ethari (The Dragon Prince)
|
32
|
Claudia & Rayla (The Dragon Prince)
|
31
|
Claudia & Viren (The Dragon Prince)
|
31
|
Callum & King Harrow (The Dragon Prince)
|
30
|
Gren & Runaan (The Dragon Prince)
|
29
|
Runaan/Ethari
|
29
|
Callum & Ezran & King Harrow (The Dragon Prince)
|
28
|
rm(RelationshipsTable)
Languages
Unsurprisingly, most works are written in English. Apologies for U+. kable package for whatever reason murders unicode characters. The two languages in question are Russian (Русский) and Chinese (中文).
#topList <- 30
languagesList <- summary(as.factor(unlist(language)))
LanguageTable <- data.frame('Language' = names(languagesList),
'Number of Stories' = languagesList )
LanguageTable <- LanguageTable[order(LanguageTable$Number.of.Stories, decreasing=TRUE),]
row.names(LanguageTable) <- c()
kable(LanguageTable,
col.names = c('Language', 'Number of Stories'))
Language
|
Number of Stories
|
English
|
2998
|
<U+0420><U+0443><U+0441><U+0441><U+043A><U+0438><U+0439>
|
12
|
Español
|
10
|
<U+4E2D><U+6587>
|
8
|
Français
|
8
|
Polski
|
6
|
Português brasileiro
|
3
|
Deutsch
|
1
|
Italiano
|
1
|
Suomi
|
1
|
#languagesList
#rm(LanguageTable)
---
title: "Ao3 data analysis for The Dragon Prince (Cartoon)"
author: "darthaline"
date: "7 Aug 2020"
output:
  html_notebook:
    code_folding: "hide"
    toc: true
---

# About

```{r setup, message = FALSE, warning=FALSE}

Sys.setenv(LANG = "en")
#library("rstudioapi") #to grab local position of the script
#setwd(dirname(rstudioapi::getActiveDocumentContext()$path))
knitr::opts_knit$set(root.dir = '.')

#library("rvest") # to handle html stuff

library(lubridate) # to handle dates

library(ggplot2) # for plotting
library(cowplot) # for plotting
library(RColorBrewer) # for choosing colors

custompalette <- brewer.pal(n=8, name = 'Dark2')

library(knitr) # for tables
library(kableExtra) # for tables

library(lubridate) # for dates

library(plyr) # ddply, to summarize number of words by author

load('TDP_worksData.RData')

```

This is a document detailing analysis of [`r tagValue` Ao3 tag](https://archiveofourown.org/tags/The%20Dragon%20Prince%20(Cartoon)/works) data, collected on the 7 Aug 2020. I haven't figured out a way to get my scrapper to log in into Ao3 (yet? rvest seems to have some trouble with page redirects), so results here are based on the works visible without authentication, which likely filters out preferentially explicit/problemantic works from the selection.

```{r plottingFunctions, collapse=TRUE, warning=FALSE}

plot_bar <- function (data, columnX, legendPosition) {
    ggplot(data, aes_string(x = columnX)) + 
    geom_bar(alpha=1)+
    theme_half_open() +
    background_grid() +
    theme(legend.title=element_blank(),
          axis.title.x = element_blank(),
          axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
    labs(y="Number of works")
}

plot_bar_color <- function (data, columnX, colColor, legendPosition) {
    ggplot(data, aes_string(x = columnX, fill=colColor)) + 
    geom_bar(alpha=0.7)+
    scale_fill_manual(values = custompalette) +
    theme_half_open() +
    background_grid() +
    theme(legend.title=element_blank(),
          axis.title.x = element_blank(),
          axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
    labs(y="Number of works")
}

plot_col <- function (data, columnX, columnY, legendPosition) {
    ggplot(data, aes_string(x = columnX, y = columnY)) + 
    geom_col(alpha=1)+
    theme_half_open() +
    background_grid() +
    theme(legend.title=element_blank(),
          axis.title.x = element_blank(),
          axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
    labs(y=gsub('\\.', ' ', columnY))
  
}

plot_col_color <- function (data, columnX, columnY, colColor, legendPosition) {
    ggplot(data, aes_string(x = columnX, y = columnY, fill=colColor)) + 
    geom_col(alpha=0.7)+
    scale_fill_manual(values = custompalette) +
    theme_half_open() +
    background_grid() +
    theme(legend.title=element_blank(),
          axis.title.x = element_blank(),
          axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
    labs(y=gsub('\\.', ' ', columnY))
  
}

plot_percentiles <- function (data, columnX, columnY, legendPosition) {
    ggplot(data, aes_string(x = columnX, y = columnY)) + 
    geom_point(alpha=0.3)+
    scale_y_log10(breaks = 10^c(0:15))+
    scale_x_continuous(breaks = c(0, 25, 50, 75, 100))+ #scale_x_continuous(breaks = c(0:10)*10)+
    theme_half_open() +
    background_grid() +
    theme(legend.title=element_blank())+
    labs(x=gsub('\\.', ' ', columnX))
}

```

```{r plottingFunctionsUnused, collapse=TRUE, warning=FALSE, eval = FALSE}

plot_density <- function (data, column, color_column, legendPosition) {
    ggplot(data, aes_string(x = column, col=color_column)) + 
    geom_density(alpha = 0.1)+
    scale_x_log10()+
    theme_half_open() +
    background_grid() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
          legend.position = legendPosition)
}

plot_points <- function (data, columnX, columnY, color_column, legendPosition) {
    ggplot(data, aes_string(x = columnX, y = columnY, col=color_column)) + 
    geom_point(alpha=0.3)+
    scale_x_log10()+
    scale_y_log10()+
    facet_wrap(color_column)+
    theme_half_open() +
    background_grid() +
    theme(legend.title=element_blank(),
          axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
}

```

```{r flatteningData, message = FALSE, warning=FALSE}
#title <- lapply(worksData, function(x) {x$Title})
author <- lapply(worksData, function(x) {x$Author})
fandom <- lapply(worksData, function(x) {x$Fandom})
rating <- lapply(worksData, function(x) {x$Rating})
warnings <- lapply(worksData, function(x) {x$Warnings})
category <- lapply(worksData, function(x) {x$Category})
WIP <- lapply(worksData, function(x) {x$WIP})
date <-lapply(worksData, function(x) {x$Date})
relationships <-lapply(worksData, function(x) {x$Relationships})
character <-lapply(worksData, function(x) {x$Character})
freeform <-lapply(worksData, function(x) {x$Freeform})
language <-lapply(worksData, function(x) {x$Language})
words <-lapply(worksData, function(x) {x$Words})
kudos <-lapply(worksData, function(x) {x$Kudos})
comments <-lapply(worksData, function(x) {x$Comments})
bookmarks<-lapply(worksData, function(x) {x$Bookmarks})
hits <-lapply(worksData, function(x) {x$Hits})

stats <- data.frame(Words = unlist(words, recursive = FALSE),
                    Comments= as.numeric(as.character(comments)),
                    Kudos = as.numeric(as.character(kudos)),
                    Bookmarks = as.numeric(as.character(bookmarks)),
                    Hits = as.numeric(as.character(hits)),
                    WIP = unlist(WIP, recursive = FALSE),
                    Rating = unlist(rating, recursive = FALSE),
                    Date = do.call("c", date))

stats$Rating <- factor(stats$Rating, levels = c("Not Rated", "General Audiences", "Teen And Up Audiences", "Mature", "Explicit"))

total <- 1000
percentile <- c(1:total)
percentileData <- data.frame(Works.Percentile = 100*(total - percentile)/total,
                             Words = unlist(lapply(percentile/total, quantile, x = unlist(words))) + 1,
                             Hits = unlist(lapply(percentile/total, quantile, x = unlist(hits))) + 1,
                             Kudos = unlist(lapply(percentile/total, quantile, x = unlist(kudos))) + 1,
                             Comments = unlist(lapply(percentile/total, quantile, x = unlist(comments))) + 1,
                             Bookmarks = unlist(lapply(percentile/total, quantile, x = unlist(bookmarks))) + 1 )

rm(rating, kudos, comments, bookmarks, hits)

```

# Timeline

Vertical lines on the graph indicate season release dates according to [Wiki article](https://en.wikipedia.org/wiki/The_Dragon_Prince#Episodes). As expected, after each new season, there's a peak of activity which fades in about 2 months. Curiously, a few works were posted before official season 1 release. This may be attributed to the series trailer drop in July 2018 at the San Diego Comic-Con.

```{r timelineTotal, message = FALSE, fig.width=10, fig.height=6}

#data$Timestamp <- parse_date_time2(as.character(data$Timestamp), orders = "%d/%m/%Y %H:%M:%S")
#data$day <- as.Date(data$Timestamp)

seasons <- c("2018-09-14", "2019-02-15", "2019-11-22")
seasons <- as.Date(seasons)

dates <- data.frame(date = do.call("c", date),
                    WIP = unlist(WIP))

plotDatesDensityTotal <- ggplot(stats, aes(x = Date)) + 
                    geom_density(alpha = 0.1)+
                    geom_vline(xintercept=seasons)+
                    scale_x_date(date_breaks="2 months")+
                    theme_half_open() +
                    background_grid() +
                    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
                          legend.position = 'right')
plotDatesDensityTotal

rm(plotDatesDensityTotal)
```

It's important to note, that I collect data from the Ao3 search page (rather than works pages, as it's less disruptive), so I don't have access to initial postage dates, only the latest updates. This means that the upward trend in works over time can be an artifact of series getting more popular, but also could be attributed to multichapter works drifting further in time due to updates.

If we plot Complete Works and Works in Progress separately, we still see an upward trend in both, but the slope characterising the growth of Work In progress peaks seems steeper than for Complete Works, which to me indicates at least partial effect of the multichapter drift.

```{r timelineWIP, message = FALSE, fig.width=10, fig.height=6}

plotDatesDensity <- ggplot(stats, aes(x = Date, col=WIP)) + 
                    geom_density(alpha = 0.1)+
                    geom_vline(xintercept=seasons)+
                    scale_x_date(date_breaks="2 months")+
                    theme_half_open() +
                    background_grid() +
                    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
                          legend.position = 'right')
plotDatesDensity

rm(plotDatesDensity)

```


# Engagement percentiles

Small plotting cheat: all the numbers on the Y axis are increased by 1 to include the case of 0 into the plot (otherwise excluded because of log scale).

* About 75% of works have more than a 1000 words, but only about 10% have more than 10000 words.
* Only about 30% of works have over a 1000 hits.
* Only about 25% of works have more than a 100 kudos.
* Only about 40% of works get more than 10 comments.
* Approximately 10% of works have no comments (tail end).
* Only approximately 25% of works get more than 10 bookmarks.
* Approximately 20% of works have no bookmarks (tail end).

```{r percentiles, message = FALSE}
wordsPercentiles <- plot_percentiles(percentileData, 'Works.Percentile', 'Words', 'right')
hitsPercentiles <- plot_percentiles(percentileData, 'Works.Percentile', 'Hits', 'right')
kudosPercentiles <- plot_percentiles(percentileData, 'Works.Percentile', 'Kudos', 'right')
commentsPercentiles <- plot_percentiles(percentileData, 'Works.Percentile', 'Comments', 'right')
bookmarksPercentiles <- plot_percentiles(percentileData, 'Works.Percentile', 'Bookmarks', 'right')

plot_grid(wordsPercentiles + theme(legend.position="none"),
          hitsPercentiles + theme(legend.position="none"),
          kudosPercentiles + theme(legend.position="none"),
          commentsPercentiles + theme(legend.position="none"),
          bookmarksPercentiles + theme(legend.position="none"),
          get_legend(kudosPercentiles +
                     theme(legend.title=element_blank())))

rm(total, percentile, percentileData, wordsPercentiles, hitsPercentiles, kudosPercentiles, commentsPercentiles, bookmarksPercentiles)

```

# Complete Work vs Work in Progress distributions

* There are approximately 4 times as many Complete Works as there are Works in Progress.
* Works in Progress are approximately 4 times longer than Complete ones.
* Works in Progress get approximately twice as many hits as Complete ones.
* Works in Progress get approximately 25% more kudos than Complete ones.
* Works in Progress get approximately 3 times as many comments as Complete ones (however there's no way to filter out author's comments in the search. selection, so this statistic should be taken with a grain of salt).
* Works in Progress get approximately twice as many bookmarks as Complete ones.

```{r totalWorksWIP, message = FALSE, warning=FALSE, fig.width=8, fig.height=6}

statsWIP <- stats
statsWIP$Divisor <- unlist(lapply(statsWIP$WIP, function(x) summary(statsWIP$WIP)[names(summary(statsWIP$WIP)) == x]))
statsWIP$Words.per.Work <- statsWIP$Words/statsWIP$Divisor
statsWIP$Hits.per.Work <- statsWIP$Hits/statsWIP$Divisor
statsWIP$Kudos.per.Work <- statsWIP$Kudos/statsWIP$Divisor
statsWIP$Comments.per.Work <- statsWIP$Comments/statsWIP$Divisor
statsWIP$Bookmarks.per.Work <- statsWIP$Bookmarks/statsWIP$Divisor

barWorksWIP <- plot_bar(statsWIP, 'WIP', 'right')
barWordsWIP <- plot_col(statsWIP, 'WIP', 'Words.per.Work', 'right')
barHitsWIP <- plot_col(statsWIP, 'WIP', 'Hits.per.Work', 'right')
barKudosWIP <- plot_col(statsWIP, 'WIP', 'Kudos.per.Work', 'right')
barCommentsWIP <- plot_col(statsWIP, 'WIP', 'Comments.per.Work', 'right')
barBookmarksWIP <- plot_col(statsWIP, 'WIP', 'Bookmarks.per.Work', 'right')

# plot_grid(plot_grid( barWorksWIP + theme(legend.position="none"),
#                      barWordsWIP + theme(legend.position="none"),
#                      barHitsWIP + theme(legend.position="none"),
#                      barKudosWIP + theme(legend.position="none"),
#                      barCommentsWIP + theme(legend.position="none"),
#                      barBookmarksWIP + theme(legend.position="none"),
#                      align = 'hv'),
#           get_legend(barWorksWIP + theme(legend.title=element_blank())),
#           rel_widths = c(4,1),
#           align = 'hv')
plot_grid( barWorksWIP + theme(legend.position="none"),
           barWordsWIP + theme(legend.position="none"),
           barHitsWIP + theme(legend.position="none"),
           barKudosWIP + theme(legend.position="none"),
           barCommentsWIP + theme(legend.position="none"),
           barBookmarksWIP + theme(legend.position="none"),
           align = 'hv')

rm(statsWIP, barWorksWIP, barWordsWIP, barHitsWIP, barKudosWIP, barCommentsWIP, barBookmarksWIP)
```


```{r statsDensitiesWIP, message = FALSE, eval=FALSE, warning=FALSE}

wordsDensityWIP <- plot_density(stats, 'Words', 'WIP', 'right')
hitsDensityWIP <- plot_density(stats, 'Hits', 'WIP', 'right')
kudosDensityWIP <- plot_density(stats, 'Kudos', 'WIP', 'right')
commentsDensityWIP <- plot_density(stats, 'Comments', 'WIP', 'right')
bookmarksDensityWIP <- plot_density(stats, 'Bookmarks', 'WIP', 'right')
  
plot_grid(wordsDensityWIP + theme(legend.position="none"),
          hitsDensityWIP + theme(legend.position="none"),
          kudosDensityWIP + theme(legend.position="none"),
          commentsDensityWIP + theme(legend.position="none"),
          bookmarksDensityWIP + theme(legend.position="none"),
          get_legend(kudosDensityWIP +
                     theme(legend.title=element_blank())))

rm(wordsDensityWIP, hitsDensityWIP, kudosDensityWIP, commentsDensityWIP, bookmarksDensityWIP)
```

```{r statsWordsWIP, message = FALSE, eval=FALSE, warning=FALSE}
wordsHitsWIP <- plot_points(stats, 'Words', 'Hits', 'WIP', 'right')
wordsHitsWIP
wordsKudosWIP <- plot_points(stats, 'Words', 'Kudos', 'WIP', 'right')
wordsKudosWIP
wordsCommentsWIP <- plot_points(stats, 'Words', 'Comments', 'WIP', 'right')
wordsCommentsWIP
wordsBookmarksWIP <- plot_points(stats, 'Words', 'Bookmarks', 'WIP', 'right')
wordsBookmarksWIP

rm(wordsHitsWIP, wordsKudosWIP, wordsCommentsWIP, wordsBookmarksWIP)
```

# Rating distributions

* Works rated G make up the most numerous category (~1300), but they are on average the shortest (~2500 words) and get fewest hits(>1000), and fewest comments (~10).
* Works rated T follow in numbers (~900), but are significantly longer (~9000 words).
* Works rated M make for the smallest category (tied with Not Rated, at ~ 200), but are the longest (~13k words).
* Works rated E are few (~350), longer than G but shorter than T rated works (~6000 words), and get the most hits, but fewer kudos and significantly less comments than T and M rated works.


```{r totalWorksRating, message = FALSE, warning=FALSE, fig.width=8, fig.height=8}

statsRating <- stats
statsRating$Divisor <- unlist(lapply(statsRating$Rating, function(x) summary(statsRating$Rating)[names(summary(statsRating$Rating)) == x]))
statsRating$Words.per.Work <- statsRating$Words/statsRating$Divisor
statsRating$Hits.per.Work <- statsRating$Hits/statsRating$Divisor
statsRating$Kudos.per.Work <- statsRating$Kudos/statsRating$Divisor
statsRating$Comments.per.Work <- statsRating$Comments/statsRating$Divisor
statsRating$Bookmarks.per.Work <- statsRating$Bookmarks/statsRating$Divisor

barWorksRating <- plot_bar(statsRating, 'Rating', 'right')
barWordsRating <- plot_col(statsRating, 'Rating', 'Words.per.Work', 'right')
barHitsRating <- plot_col(statsRating, 'Rating', 'Hits.per.Work', 'right')
barKudosRating <- plot_col(statsRating, 'Rating', 'Kudos.per.Work', 'right')
barCommentsRating <- plot_col(statsRating, 'Rating', 'Comments.per.Work', 'right')
barBookmarksRating <- plot_col(statsRating, 'Rating', 'Bookmarks.per.Work', 'right')

plot_grid( barWorksRating + theme(legend.position="none"),
           barWordsRating + theme(legend.position="none"),
           barHitsRating + theme(legend.position="none"),
           barKudosRating + theme(legend.position="none"),
           barCommentsRating + theme(legend.position="none"),
           barBookmarksRating + theme(legend.position="none"),
           align = 'hv')

rm(statsRating, barWorksRating, barWordsRating, barHitsRating, barKudosRating, barCommentsRating, barBookmarksRating)
```

```{r statsDensitiesRating, message = FALSE, warning=FALSE, eval=FALSE}
wordsDensityRating <- plot_density(stats, 'Words', 'Rating', 'right')
hitsDensityRating <- plot_density(stats, 'Hits', 'Rating', 'right')
kudosDensityRating <- plot_density(stats, 'Kudos', 'Rating', 'right')
commentsDensityRating <- plot_density(stats, 'Comments', 'Rating', 'right')
bookmarksDensityRating <- plot_density(stats, 'Bookmarks', 'Rating', 'right')

plot_grid(wordsDensityRating + theme(legend.position="none"),
          hitsDensityRating + theme(legend.position="none"),
          kudosDensityRating + theme(legend.position="none"),
          commentsDensityRating + theme(legend.position="none"),
          bookmarksDensityRating + theme(legend.position="none"),
          get_legend(kudosDensityRating +
                     theme(legend.title=element_blank())))

rm(wordsDensityRating, hitsDensityRating, kudosDensityRating, commentsDensityRating, bookmarksDensityRating)
```


```{r statsWordsRating, message = FALSE, warning=FALSE, eval=FALSE}
wordsHitsRating <- plot_points(stats, 'Words', 'Hits', 'Rating', 'right')
wordsHitsRating
wordsKudosRating <- plot_points(stats, 'Words', 'Kudos', 'Rating', 'right')
wordsKudosRating
wordsCommentsRating <- plot_points(stats, 'Words', 'Comments', 'Rating', 'right')
wordsCommentsRating
wordsBookmarksRating <- plot_points(stats, 'Words', 'Bookmarks', 'Rating', 'right')
wordsBookmarksRating

rm(wordsHitsRating, wordsKudosRating, wordsCommentsRating, wordsBookmarksRating)

```

# Categories

There are `r length(category[unlist(lapply(category, function(x) length(x))) == 1])` works tagged with a single category, and `r length(category[unlist(lapply(category, function(x) length(x))) > 1])` tagged with 2 or more (up until all 6).

'F/M' is the most popular category, followed by 'M/M', 'Gen', and 'F/F'.

Multiple category fics strongly contribute towards 'F/M' count, then to 'M/M', 'Gen', and 'F/F', and only marginally to 'Multi' and 'Other'.

```{r categoriesBars, message = FALSE}

singleCategorySummary <- summary(as.factor(unlist(category[unlist(lapply(category, function(x) length(x))) == 1])))
singleCategorySummary <- data.frame(Category = names(singleCategorySummary),
                                    Number.of.Works = singleCategorySummary)
singleCategorySummary$Split <- "Single category"

multipleCategorySummary <- data.frame(Category = c('Gen', 'F/F', 'F/M', 'M/M', 'Multi', 'Other', 'No category'),
                              Number.of.Works = c(sum(grepl('Gen',category)),
                                                  sum(grepl('F/F',category)),
                                                  sum(grepl('F/M',category)),
                                                  sum(grepl('M/M',category)),
                                                  sum(grepl('Multi',category)),
                                                  sum(grepl('Other',category)),
                                                  sum(grepl('No category',category))) )
multipleCategorySummary$Split <- "All works"

categorySummary <- rbind(singleCategorySummary, multipleCategorySummary)
categorySummary$Category <- factor(categorySummary$Category, levels = c('Gen', 'F/F', 'F/M', 'M/M', 'Multi', 'Other', 'No category'))
categorySummary$Split <- factor(categorySummary$Split, levels = c("Single category", "All works"))

plotCategories <- ggplot(categorySummary, aes(x = Category, y = Number.of.Works)) + 
                  geom_col(alpha=1)+
                  theme_half_open() +
                  background_grid() +
                  facet_wrap(.~Split) +
                  theme(legend.title=element_blank(),
                        axis.title.x = element_blank(),
                        axis.text.x = element_text(angle = 90, vjust = 1, hjust=1))+
                  labs(y="Number of Works")
plotCategories

rm(singleCategorySummary, multipleCategorySummary, categorySummary, plotCategories)

```

# Engagement by a single category

For simplicity I'm only looking at works tagged with a single category here.

"Multi" seems to have most words, despite being a rather small category, and collects quite a bit of Hits and Comments. It's possible that a number of those works are collections of stories for many fandoms, which amplifies the number of Hits and Comments, but that requires further investigation.

```{r categoriesSingleEngagement, message = FALSE, warning = FALSE, fig.width=10, fig.height=6}

statsCategory <- stats[unlist(lapply(category, function(x) length(x))) == 1,]
statsCategory$Category <- as.factor(unlist(category[unlist(lapply(category, function(x) length(x))) == 1]))
statsCategory$Category <- factor(statsCategory$Category, levels = c('Gen', 'F/F', 'F/M', 'M/M', 'Multi', 'Other', 'No category'))
statsCategory$Divisor <- unlist(lapply(statsCategory$Category, function(x) summary(statsCategory$Category)[names(summary(statsCategory$Category)) == x]))
statsCategory$Words.per.Work <- statsCategory$Words/statsCategory$Divisor
statsCategory$Hits.per.Work <- statsCategory$Hits/statsCategory$Divisor
statsCategory$Kudos.per.Work <- statsCategory$Kudos/statsCategory$Divisor
statsCategory$Comments.per.Work <- statsCategory$Comments/statsCategory$Divisor
statsCategory$Bookmarks.per.Work <- statsCategory$Bookmarks/statsCategory$Divisor
statsCategory$Works.Percent <- 1/statsCategory$Divisor

barWorksCategory <- plot_bar_color(statsCategory, 'Category', 'Rating', 'right')
barWordsCategory <- plot_col_color(statsCategory, 'Category', 'Words.per.Work', 'Rating', 'right')
barHitsCategory <- plot_col_color(statsCategory, 'Category', 'Hits.per.Work', 'Rating', 'right')
barKudosCategory <- plot_col_color(statsCategory, 'Category', 'Kudos.per.Work', 'Rating', 'right')
barCommentsCategory <- plot_col_color(statsCategory, 'Category', 'Comments.per.Work', 'Rating', 'right')
barBookmarksCategory <- plot_col_color(statsCategory, 'Category', 'Bookmarks.per.Work','Rating', 'right')

plot_grid(plot_grid( barWorksCategory + theme(legend.position="none"),
           barWordsCategory + theme(legend.position="none"),
           barHitsCategory + theme(legend.position="none"),
           barKudosCategory + theme(legend.position="none"),
           barCommentsCategory + theme(legend.position="none"),
           barBookmarksCategory + theme(legend.position="none"),
           align = 'hv'),
          get_legend(barWorksCategory + theme(legend.title=element_blank())),
          rel_widths = c(4,1))

```

# Ratings percentages by a single category

Out of the 3 main shipping categories in absolute numbers "M/M" has most E rated works, and "F/F" has the least, but in percentages of total works "F/F" and "F/M" are distributed almost identically, while "M/M" skews more towards M and E rated works.

```{r categoriesSingleEngagementPercent, message = FALSE, warning = FALSE, fig.width=10, fig.height=6}

plotWorksCategoryNormalized <- plot_col_color(statsCategory, 'Rating', 'Works.Percent', 'Rating', 'none')+
                               scale_y_continuous(labels=scales::percent)+
                               facet_wrap(.~Category)
plotWorksCategoryNormalized

rm(barWorksCategory, barWordsCategory, barHitsCategory, barKudosCategory, barCommentsCategory, barBookmarksCategory, plotWorksCategoryNormalized)

```

# Single Category through time

Interestingly, season 1 sees a peak of 'Gen' category. Season 2 gives higher rise to 'M/M' (possibly related to Aaravos reveal and 'Aaravos/Viren (The Dragon Prince)' shipping) and 'F/M' (the rise of 'Callum/Rayla (The Dragon Prince)'?), and season 3 is followed by a high rise of 'F/F' (following the development of 'Amaya/Janai (The Dragon Prince)' relationship) and a modest 'M/M' peak ('Ethari/Runaan (The Dragon Prince)' due to Ethari finally getting official name?).

```{r singleCategoryTime, message = FALSE, fig.width=10, fig.height=6}

plotDatesCategoryDensity <- ggplot(statsCategory, aes(x = Date, col=Category)) + 
                    geom_density(alpha = 0.1)+
                    geom_vline(xintercept=seasons)+
                    scale_x_date(date_breaks="2 months")+
                    scale_color_manual(values = custompalette) +
                    theme_half_open() +
                    background_grid() +
                    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1),
                          legend.position = 'right')
plotDatesCategoryDensity

rm(plotDatesCategoryDensity)
```

# Most popular ship tags

Undeniably the most popular ship tag is "Callum/Rayla (The Dragon Prince)", scoring almost 4 times as many stories as the next most popular ship tag "Amaya/Janai (The Dragon Prince)".

```{r shipsHistogram, message = FALSE, fig.width=8, fig.height=6}

topList <- 8
topRelationshipsTable<- data.frame('Relationship' = names(summary(as.factor(unlist(relationships)))[1:topList]),
                          'Number of Stories' = summary(as.factor(unlist(relationships)))[1:topList])
row.names(topRelationshipsTable) <- c()
topRelationshipsTable <- topRelationshipsTable[order(topRelationshipsTable$Number.of.Stories, decreasing = TRUE),]
topRelationshipsTable$Relationship <- factor(as.character(topRelationshipsTable$Relationship), levels=as.character(topRelationshipsTable$Relationship))

relationshipsStats <- data.frame(Date = stats$Date,
                                 relationship1 = rep(0, length(stats$Date)),
                                 relationship2 = rep(0, length(stats$Date)),
                                 relationship3 = rep(0, length(stats$Date)),
                                 relationship4 = rep(0, length(stats$Date)),
                                 relationship5 = rep(0, length(stats$Date)),
                                 relationship6 = rep(0, length(stats$Date)),
                                 relationship7 = rep(0, length(stats$Date)),
                                 relationship8 = rep(0, length(stats$Date)))
for (i in 1:topList){
  matchingVector <- lapply(relationships, match, table=as.character(topRelationshipsTable$Relationship[i]))
  matchingVector <- unlist(lapply(matchingVector, sum, na.rm=TRUE))
  relationshipsStats[i+1] <- matchingVector
}

#colnames(relationshipsStats)[2:9] <- gsub('/', '\\/', topRelationshipsTable$Relationship)

plotLegendRelationships <- ggplot(topRelationshipsTable, aes(x=Relationship, y=Number.of.Stories, fill=Relationship))+
  geom_col(alpha=0.7)+
  scale_fill_manual(values = custompalette)+
  theme_half_open() +
  background_grid() +
  labs(x="",y='Number of Stories')+
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))
plotLegendRelationships+theme(legend.position = 'none')
```

# Ship tags through time

"Amaya/Janai (The Dragon Prince)" sharply took of in popularity after season 3. Ethari's name hasn't been revealed until season 3, so authors were using "Runaan/Tinker | Necklace Elf (The Dragon Prince)" instead of "Ethari/Runaan (The Dragon Prince)", but Ao3 would consider them synonymous. Romance tag "Callum/Rayla (The Dragon Prince)" is significantly more popular than platonic/frienship tag "Callum & Rayla (The Dragon Prince)", however it's worth noting that out of `r topRelationshipsTable[topRelationshipsTable$Relationship == "Callum & Rayla (The Dragon Prince)",]$Number.of.Stories` stories tagged with a friendship/platonic one `r sum((relationshipsStats$relationship1+relationshipsStats$relationship3)>1)` are tagged with both, possibly making specifically friendship/platonic content more difficult to find.

```{r shipTagsTime, message = FALSE, fig.width=12, fig.height=6}

plotRelationships <- ggplot() +
    geom_density(data = relationshipsStats[relationshipsStats$relationship1 > 0,], mapping=aes(x = Date), colour=custompalette[1])+
    geom_density(data = relationshipsStats[relationshipsStats$relationship2 > 0,], mapping=aes(x = Date), colour=custompalette[2])+
    geom_density(data = relationshipsStats[relationshipsStats$relationship3 > 0,], mapping=aes(x = Date), colour=custompalette[3])+
    geom_density(data = relationshipsStats[relationshipsStats$relationship4 > 0,], mapping=aes(x = Date), colour=custompalette[4])+
    geom_density(data = relationshipsStats[relationshipsStats$relationship5 > 0,], mapping=aes(x = Date), colour=custompalette[5])+
    geom_density(data = relationshipsStats[relationshipsStats$relationship6 > 0,], mapping=aes(x = Date), colour=custompalette[6])+
    geom_density(data = relationshipsStats[relationshipsStats$relationship7 > 0,], mapping=aes(x = Date), colour=custompalette[7])+
    geom_density(data = relationshipsStats[relationshipsStats$relationship8 > 0,], mapping=aes(x = Date), colour=custompalette[8])+
    geom_vline(xintercept=seasons)+
    scale_x_date(date_breaks="2 months")+
    scale_color_manual(values = custompalette) +
    theme_half_open() +
    background_grid() +
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust=1))

mylegend <- get_legend(plotLegendRelationships)

plot_grid(plotRelationships, mylegend,
          rel_widths = c(2,1), nrow=1)
#plotRelationships

#rm(seasons, plotDatesRatingDensity)
```

# Archive Warnings

Majority of works are tagged with "No Archive Warnings Apply", followed by a sizable fraction of "Creator Chose Not To Use Archive Warnings". It seems to be a common matter of confusion between the usage of those two warnings, so it's possible that a lot of "Creator Chose Not To Use Archive Warnings" are mistagged "No Archive Warnings Apply".

```{r warningBars, message = FALSE, fig.width=6, fig.height=6}

multipleWarningSummary <- data.frame(Warning = c("No Archive Warnings Apply",
                                                  "Graphic Depictions Of Violence",
                                                  "Major Character Death",
                                                  "Rape/Non-Con",
                                                  "Underage",
                                                  "Creator Chose Not To Use Archive Warnings"),
                              Number.of.Works = c(sum(grepl("No Archive Warnings Apply",warnings)),
                                                  sum(grepl("Graphic Depictions Of Violence",warnings)),
                                                  sum(grepl("Major Character Death",warnings)),
                                                  sum(grepl("Rape/Non-Con",warnings)),
                                                  sum(grepl("Underage",warnings)),
                                                  sum(grepl("Creator Chose Not To Use Archive Warnings",warnings))) )

multipleWarningSummary$Warning <- factor(multipleWarningSummary$Warning, levels = c("No Archive Warnings Apply",
                                                                                    "Graphic Depictions Of Violence",
                                                                                    "Major Character Death",
                                                                                    "Rape/Non-Con",
                                                                                    "Underage",
                                                                                    "Creator Chose Not To Use Archive Warnings"))

plotWarnings <- plot_col(multipleWarningSummary, 'Warning', 'Number.of.Works', 'right')
plotWarnings

rm(multipleWarningSummary, plotWarnings)

```

# Multiple Fandoms

```{r multipleFandoms, message = FALSE}

multiFandoms <- fandom[category == "Multi"]

multiFandomsAll <- fandom[grepl('Multi', category)]

severalFandoms <- fandom[unlist(lapply(fandom, length)) > 1]

crossovers <- fandom[grep('crossover',freeform, ignore.case=TRUE)]

```

Total number of works tagged only as 'Multi' is `r length(multiFandoms)`, but only `r sum(lapply(multiFandoms, length) > 1)` are tagged with more than one fandom. Among these, median number of fandoms tagged is `r summary( unlist(lapply(multiFandoms[lapply(multiFandoms, length) > 1], length)) )[3]`, and at least one work is tagged with `r summary( unlist(lapply(multiFandoms[lapply(multiFandoms, length) > 1], length)) )[6]`.

Number of works tagged with 'Multi' and/or other categories is `r length(multiFandomsAll)`, but only `r sum(lapply(multiFandomsAll, length) > 1)` are tagged with more than one fandom. Among these, median number of fandoms tagged is `r summary( unlist(lapply(multiFandomsAll[lapply(multiFandomsAll, length) > 1], length)) )[3]`, and at least one work is tagged with `r summary( unlist(lapply(multiFandomsAll[lapply(multiFandomsAll, length) > 1], length)) )[6]`.

Number of works tagged with more than 1 fandom is `r length(severalFandoms)`, however in some cases fandom tags used by the author are synonymous with `r tagValue`, for example "The Dragon Prince", "rayllum - Fandom", "rayla x callum", "TDP - Fandom", "callum x rayla".

Number of works explicitly tagged as 'crossover' is lower: `r length(crossovers)`. Out of multiple fandom tag works a significant amount are tagged with 2: `r length(fandom[unlist(lapply(fandom, length)) == 2])`, which, upon inspection, don't seem overwhelmingly synonymous, so perhaps some authors simply don't tag for crossovers.

# Authors by Works

Top 30 of most prolific authors in the tag by the number of stories as of data collection date:

```{r authorsWorks, message = FALSE}
topList <- 30

AuthorTable <- data.frame('Author' = names(summary(as.factor(unlist(author)))[1:topList]),
                          'Number of Stories' = summary(as.factor(unlist(author)))[1:topList])
row.names(AuthorTable) <- c()

kable(AuthorTable,
      col.names = c('Author', 'Number of Stories'))

rm(AuthorTable)
```

# Authors by Words

Only `r sum(unlist(lapply(author, length))>1)` works have more than one author. In cases where works had more than one author, I assumed that each of them contributed an equal amounts of words.

Top 30 of most prolific authors in the tag by the number of words written as of data collection date:

```{r authorsWords, message = FALSE}

wordsByAuthor <- c()

for (i in 1:length(words)){
  if (length(author[[i]]) > 1) {
    wordsByAuthor <- c(wordsByAuthor, rep(words[[i]]/length(author[[5]]), length(author[[i]]) ) )
  } else {
    wordsByAuthor <- c(wordsByAuthor, words[[i]])
  }
}

AuthorWordsTable <- data.frame('Author' = as.factor(unlist(author)),
                               'Words' = wordsByAuthor)

AuthorWordsSummary <- ddply(AuthorWordsTable, .(Author), 
                            summarize, 
                            Total.Words = sum(Words))
AuthorWordsSummary <- AuthorWordsSummary[order(AuthorWordsSummary$Total.Words, decreasing = TRUE),]
row.names(AuthorWordsSummary) <- c()

topList <- 30

kable(AuthorWordsSummary[1:topList,],
      col.names = c('Author', 'Total Words'))

rm(wordsByAuthor, i, AuthorWordsTable, AuthorWordsSummary)
```

# Characters

Top 30 of the most popular characters:

```{r characters, message = FALSE}
topList <- 30
CharacterTable<- data.frame('Character' = names(summary(as.factor(unlist(character)))[1:topList]),
                          'Number of Stories' = summary(as.factor(unlist(character)))[1:topList])
row.names(CharacterTable) <- c()

kable(CharacterTable,
      col.names = c('Character', 'Number of Stories'))

rm(CharacterTable)
```

# Relationships

Top 30 of the most popular relationships:

I don't have access to Ao3's system of synonymous tags, so by virtue of text processing some relationship tags here are repeated. 

Overwhelmingly, "Callum/Rayla (The Dragon Prince)" is the most popular relationship in TDP. They are followed by "Ethari/Runaan (The Dragon Prince)", which is not immediately obvious due to common use of synonymous tags such as "Runaan/Tinker | Necklace Elf (The Dragon Prince)" and "Runaan/Ethari". Third most popular relationship is "Amaya/Janai (The Dragon Prince)".

```{r relationships, message = FALSE}
topList <- 30
RelationshipsTable<- data.frame('Relationship' = names(summary(as.factor(unlist(relationships)))[1:topList]),
                          'Number of Stories' = summary(as.factor(unlist(relationships)))[1:topList])
row.names(RelationshipsTable) <- c()

kable(RelationshipsTable,
      col.names = c('Relationship', 'Number of Stories'))

rm(RelationshipsTable)
```

# Freeform tags

Top 30 of the most popular freeform tags

```{r freeform, message = FALSE}
topList <- 30
FreeformTable<- data.frame('Freeform' = names(summary(as.factor(unlist(freeform)))[1:topList]),
                          'Number of Stories' = summary(as.factor(unlist(freeform)))[1:topList])
row.names(FreeformTable) <- c()

kable(FreeformTable,
      col.names = c('Freeform Tag', 'Number of Stories'))

rm(FreeformTable)
```

# Languages

Unsurprisingly, most works are written in English. Apologies for U+. kable package for whatever reason murders unicode characters. The two languages in question are Russian (Русский) and Chinese (中文).

```{r languages, message = FALSE}
#topList <- 30

languagesList <- summary(as.factor(unlist(language)))

LanguageTable <- data.frame('Language' = names(languagesList),
                            'Number of Stories' = languagesList )
LanguageTable <- LanguageTable[order(LanguageTable$Number.of.Stories, decreasing=TRUE),]
row.names(LanguageTable) <- c()

kable(LanguageTable,
      col.names = c('Language', 'Number of Stories'))

#languagesList

#rm(LanguageTable)
```

# Other links

Ao3 data analysis for The Dragon Prince (Cartoon)

[Ao3 data analysis for Avatar: Legend of Korra](https://darthaline.github.io/Ao3SearchAnalysis/fandoms/LOK/LOK_processing_notebook.nb.html)

[Ao3 data analysis for Avatar: The Last Airbender](https://darthaline.github.io/Ao3SearchAnalysis/fandoms/ATLA/ATLA_processing_notebook.nb.html)

[Ao3 data analysis for Black Sails](https://darthaline.github.io/Ao3SearchAnalysis/fandoms/BSails/BSails_processing_notebook.nb.html)

If you enjoyed my analysis, please, consider [buying me a coffee](https://ko-fi.com/D1D8RIG5) or some other beverage.