feat: New plots. · stau.space/underrepresentation-theory@29a8766

+43

code/synonym_plots.R

# Heat map of the summary statistics stored in data/synonym_thresholds.csv,
# one row per word-embedding model, written to synonyms.pdf.

library(ggplot2)
library(reshape2)
library(viridis)

data <- read.csv("data/synonym_thresholds.csv")

# Keep only the six summary-statistic columns, in display order.
mat <- data.frame(
  data$min,
  data$q1,
  data$median,
  data$mean,
  data$q3,
  data$max
) |> as.matrix()

# NOTE(review): the labels below assume the CSV rows appear in exactly this
# model order -- confirm against the file that produces the CSV.
rownames(mat) <- c(
  "Google News",
  "GloVe 300-D",
  "GloVe 200-D",
  "GloVe 100-D",
  "GloVe 50-D"
)

colnames(mat) <- c(
  "Minimum",
  "First Quartile",
  "Median",
  "Mean",
  "Third Quartile",
  "Max"
)

# Long format: one row per (model, statistic) cell, columns Var1, Var2, value.
ld <- melt(mat)
# Cells whose value is exactly 0 are dropped, so they are left blank in the
# plot (presumably placeholders for statistics that were not computed).
ld <- ld[ld$value != 0, ]

pdf("synonyms.pdf")
# print() is required for the plot to be drawn when this script is run through
# source(); a bare ggplot expression is only rendered by top-level
# auto-printing.
print(
  ggplot(ld, aes(x = Var2, y = Var1)) +
    geom_raster(aes(fill = value)) +
    scale_fill_viridis_c() +
    geom_text(aes(label = value)) +
    labs(x = "Quartiles", y = "Models") +
    theme_light() +
    theme(aspect.ratio = 1)
)
dev.off()

+93 -18

code/variable_analysis.R

# ··· (earlier lines of the file are not shown in this diff view) ···
library(SnowballC) # wordStem
# install.packages("ggplot2")
library(ggplot2) # ggplot
library(viridis) # viridis colour palettes
# NOTE(review): no tidyverse-specific call is visible in this diff view --
# confirm that this attach is actually needed.
library(tidyverse)


load("data/stem_AMS.Rda")
# ··· (unchanged lines are not shown in this diff view) ···

# One bar chart of word frequencies per data source, one page each.
# `data_sources`, `names` and the counter `i` are set up in lines that are not
# shown in this diff view.
pdf("plots_stem.pdf", onefile = TRUE)
for (data in data_sources) {
  data <- data[complete.cases(data), ]
  print(
    ggplot(data, aes(x = reorder(word, n), y = n)) +
      geom_col() +
      coord_flip() +
      labs(
        title = paste("Most frequent words in", names[[i]], "surveys"),
        # labs() names aesthetics, not screen positions: after coord_flip()
        # the x aesthetic (the words) is still the one labelled by `x`.
        x = "Words",
        y = "Occurrences"
      ) +
      theme_minimal() +
      theme(aspect.ratio = 1)
  )
  i <- i + 1
}
dev.off()
# ··· (unchanged lines are not shown in this diff view) ···
load("data/google_sema_IPEDS.Rda")
sema_count_google_ipeds <- sema_count

# Total occurrences of one tag category for every (survey, model) pair.
#
# Reads the fifteen global `sema_count_*` data frames and, for each, sums the
# `n` column over the rows whose `word_category` equals `str` (NAs ignored).
#
# @param str Tag category to count, e.g. "lgbt" or "race/ethnicity".
# @return A data frame with 15 rows and the columns `model` (embedding model
#   label), `title` (survey name) and `n` (summed occurrences).
get_df <- function(str) {
  count_category <- function(counts) {
    as.numeric(sum(counts$n[counts$word_category == str], na.rm = TRUE))
  }

  models <- c(
    "GloVe 50D", "GloVe 100D", "GloVe 200D", "GloVe 300D", "Google News"
  )
  surveys <- c("IPEDS", "CBMS", "AMS")

  # Within each survey the sources MUST follow the order of `models`; the
  # labels are attached purely by position.
  sources <- list(
    sema_count_glove_50d_ipeds,
    sema_count_glove_100d_ipeds,
    sema_count_glove_200d_ipeds,
    sema_count_glove_300d_ipeds,
    sema_count_google_ipeds,
    sema_count_glove_50d_cbms,
    sema_count_glove_100d_cbms,
    sema_count_glove_200d_cbms,
    sema_count_glove_300d_cbms,
    sema_count_google_cbms,
    sema_count_glove_50d_ams,
    sema_count_glove_100d_ams,
    sema_count_glove_200d_ams,
    sema_count_glove_300d_ams,
    sema_count_google_ams
  )

  data.frame(
    model = rep(models, times = length(surveys)),
    title = rep(surveys, each = length(models)),
    n = vapply(sources, count_category, numeric(1))
  )
}

# ------------------------------ tag categories ------------------------------

lgbt_df <- get_df("lgbt")

race_ethn_df <- get_df("race/ethnicity")

women_df <- get_df("women")

disabilities_df <- get_df("disabilities")

# One grouped bar chart per tag category, one page each.
i <- 1
tags <- c("LGBT", "Race/Ethnicity", "Women", "Disabilities")
pdf("tags.pdf")
for (df in list(lgbt_df, race_ethn_df, women_df, disabilities_df)) {
  tag_name <- tags[i]
  print(
    ggplot(df, aes(fill = model, x = title, y = n)) +
      geom_bar(position = "dodge", stat = "identity") +
      scale_fill_viridis_d() +
      labs(
        # Without a title the four pages cannot be told apart.
        title = tag_name,
        x = "Data Source",
        y = "Occurrences"
      ) +
      theme_light() +
      theme(aspect.ratio = 1)
  )
  i <- i + 1
}
dev.off()

data_sources <- list(
  sema_count_glove_100d_ams,
  sema_count_glove_100d_cbms,
  sema_count_glove_100d_ipeds,
  sema_count_glove_200d_ams,
  sema_count_glove_200d_cbms,
  sema_count_glove_200d_ipeds,
  sema_count_glove_300d_ams,
  sema_count_glove_300d_cbms,
  sema_count_glove_300d_ipeds,
  sema_count_glove_50d_ams,
  sema_count_glove_50d_cbms,
  sema_count_glove_50d_ipeds,
  sema_count_google_ams,
  sema_count_google_cbms,
  sema_count_google_ipeds
)
# Page titles, in the same order as `data_sources`.
names <- list(
  "AMS surveys w/ 100-D GloVe model",
  "CBMS surveys w/ 100-D GloVe model",
  "IPEDS surveys w/ 100-D GloVe model",
  "AMS surveys w/ 200-D GloVe model",
  "CBMS surveys w/ 200-D GloVe model",
  "IPEDS surveys w/ 200-D GloVe model",
  "AMS surveys w/ 300-D GloVe model",
  "CBMS surveys w/ 300-D GloVe model",
  "IPEDS surveys w/ 300-D GloVe model",
  "AMS surveys w/ 50-D GloVe model",
  "CBMS surveys w/ 50-D GloVe model",
  "IPEDS surveys w/ 50-D GloVe model",
  "AMS surveys w/ Google News model",
  "CBMS surveys w/ Google News model",
  "IPEDS surveys w/ Google News model"
) # NOTE(review): this closing paren lies in the lines hidden by the diff view
# ··· (unchanged lines are not shown in this diff view; the counter `i` is
# presumably reset there -- confirm) ···

# One bar chart of tag categories per data source, one page each.
pdf("plots_sema.pdf", onefile = TRUE)
for (data in data_sources) {
  data <- data[complete.cases(data), ]
  print(
    ggplot(data) +
      # NOTE(review): with only a `y` aesthetic geom_bar() counts rows per
      # word_category; it does not sum the `n` column -- confirm that row
      # counts are what "Occurrences" is meant to show.
      geom_bar(aes(y = word_category, fill = word_category)) +
      labs(
        title = paste("Most frequent tags in", names[[i]]),
        x = "Occurrences",
        y = "Tags"
      ) +
      theme_minimal() +
      theme(aspect.ratio = 1)
  )
  i <- i + 1
}
dev.off()

+2

flake.nix

··· 36 36 readr 37 37 remotes 38 38 resampledata 39 + reshape2 39 40 reticulate 40 41 rhdf5 41 42 rmarkdown ··· 55 56 units 56 57 word2vec 57 58 wordcloud 59 + viridis 58 60 ]; 59 61 myPythonPackages = with pkgs.python313Packages; [ 60 62 colorama

Configure Feed

Configure Feed