at main 6.5 kB view raw
# install.packages("tidyverse")
library(tibble) # tibble (comes from tidyverse)
# install.packages("word2vec")
library(word2vec) # read.wordvectors
# install.packages("reticulate")
library(reticulate) # reticulate::py_eval
# install.packages("tidytext")
library(tidytext) # data("stop_words")
# install.packages("dplyr")
library(dplyr) # anti_join
# install.packages("SnowballC")
library(SnowballC) # wordStem
# install.packages("ggplot2")
library(ggplot2) # ggplot
library(viridis) # scale_viridis_d
# NOTE(review): purpose unclear; tibble, dplyr and ggplot2 are already
# attached individually above.
library(tidyverse)

# Configuration ----

# Survey data sources, in the order their pages appear in the PDFs.
survey_sources <- c("AMS", "CBMS", "IPEDS")

# One row per embedding model:
#   label       - legend label used in the per-tag bar charts
#   file_prefix - prefix of the saved counts, "data/<prefix>_<source>.Rda"
#   title_name  - wording used in the per-model plot titles
# Keeping these side by side (instead of in separate hand-written lists)
# guarantees that a label can never be paired with another model's counts.
sema_models <- data.frame(
  label = c(
    "GloVe 50D", "GloVe 100D", "GloVe 200D", "GloVe 300D", "Google News"
  ),
  file_prefix = c(
    "glove_50d_0.273", "glove_100d_0.219", "glove_200d_0.164",
    "glove_300d_0.134", "google_news_0.185"
  ),
  title_name = c(
    "50-D GloVe", "100-D GloVe", "200-D GloVe", "300-D GloVe", "Google News"
  )
)

# Helpers ----

#' Load a single named object from an .Rda file.
#'
#' The file is loaded into a private environment so that successive files
#' which all store their payload under the same name do not overwrite each
#' other (or anything else) in the global environment.
#'
#' @param path Path to the .Rda file.
#' @param name Name of the object to extract from the file.
#' @return The object stored under `name`.
load_object <- function(path, name) {
  env <- new.env()
  load(path, envir = env)
  if (!exists(name, envir = env, inherits = FALSE)) {
    stop(
      "'", path, "' does not contain an object named '", name, "'",
      call. = FALSE
    )
  }
  get(name, envir = env, inherits = FALSE)
}

#' Print a list of plots into one multi-page PDF.
#'
#' @param path Output file.
#' @param plots List of ggplot objects, one per page.
#' @return `path`, invisibly.
write_plots_pdf <- function(path, plots) {
  pdf(path, onefile = TRUE)
  # Close the device even if printing one of the plots fails.
  on.exit(dev.off(), add = TRUE)
  for (p in plots) {
    print(p)
  }
  invisible(path)
}

#' Sum the `n` column over the rows belonging to one word category.
#'
#' Rows whose category is NA are ignored, as are NA counts.
#'
#' @param counts Data frame with `word_category` and `n` columns.
#' @param category Category to match exactly.
#' @return A single number.
count_category <- function(counts, category) {
  hits <- which(counts$word_category == category)
  as.numeric(sum(counts$n[hits], na.rm = TRUE))
}

# Most frequent stemmed words per survey source ----

stem_counts <- lapply(
  setNames(survey_sources, survey_sources),
  \(src) {
    load_object(file.path("data", paste0("stem_", src, ".Rda")), "stem_count")
  }
)

stem_plots <- lapply(survey_sources, \(src) {
  counts <- stem_counts[[src]]
  counts <- counts[complete.cases(counts), ]
  ggplot(counts, aes(x = reorder(word, n), y = n)) +
    geom_col() +
    coord_flip() +
    labs(
      title = paste("Most frequent words in", src, "surveys"),
      # labs() names aesthetics, not screen positions: x is still the word
      # even though coord_flip() draws it on the vertical axis.
      x = "Words",
      y = "Occurrences"
    ) +
    theme_minimal() +
    theme(aspect.ratio = 1)
})
write_plots_pdf("plots_stem.pdf", stem_plots)

# Semantic tag counts per embedding model and survey source ----

# Nested list: sema_counts[[model label]][[survey source]].
sema_counts <- lapply(
  setNames(sema_models$file_prefix, sema_models$label),
  \(prefix) {
    lapply(
      setNames(survey_sources, survey_sources),
      \(src) {
        load_object(
          file.path("data", paste0(prefix, "_", src, ".Rda")),
          "sema_count"
        )
      }
    )
  }
)

#' Total occurrences of one word category for every model/source pair.
#'
#' Reads the script-level `sema_counts` and `sema_models` tables.
#'
#' @param str Word category to count (matched exactly against
#'   `word_category`).
#' @return A data frame with one row per (source, model) pair and columns
#'   `model` (model label), `title` (survey source) and `n` (summed count).
#'   Rows are ordered IPEDS, CBMS, AMS, and within each source by the order
#'   of `sema_models`.
get_df <- function(str) {
  sources <- rev(survey_sources)
  model <- rep(sema_models$label, times = length(sources))
  title <- rep(sources, each = nrow(sema_models))
  # Look every count up by its own label so label and value always agree.
  n <- vapply(
    seq_along(model),
    \(i) count_category(sema_counts[[model[i]]][[title[i]]], str),
    numeric(1)
  )
  data.frame(model = model, title = title, n = n)
}

# Per-tag comparison across models and sources ----

lgbt_df <- get_df("lgbt")

race_ethn_df <- get_df("race/ethnicity")

women_df <- get_df("women")

disabilities_df <- get_df("disabilities")

tag_dfs <- list(
  "LGBT" = lgbt_df,
  "Race/Ethnicity" = race_ethn_df,
  "Women" = women_df,
  "Disabilities" = disabilities_df
)

tag_plots <- lapply(names(tag_dfs), \(tag_name) {
  ggplot(tag_dfs[[tag_name]], aes(fill = model, x = title, y = n)) +
    geom_bar(position = "dodge", stat = "identity") +
    scale_fill_viridis_d() +
    labs(
      # Without a title the four pages of the PDF are indistinguishable.
      title = tag_name,
      x = "Data Source",
      y = "Occurrences"
    ) +
    theme_light() +
    theme(aspect.ratio = 1)
})
write_plots_pdf("tags.pdf", tag_plots)

# Tag distribution for every model/source pair ----

# Page order of plots_sema.pdf: model by model, each with AMS, CBMS, IPEDS.
sema_page_models <- c(
  "GloVe 100D", "GloVe 200D", "GloVe 300D", "GloVe 50D", "Google News"
)
sema_pages <- data.frame(
  model = rep(sema_page_models, each = length(survey_sources)),
  source = rep(survey_sources, times = length(sema_page_models))
)

sema_plots <- lapply(seq_len(nrow(sema_pages)), \(i) {
  model <- sema_pages$model[i]
  src <- sema_pages$source[i]
  title_name <- sema_models$title_name[match(model, sema_models$label)]
  counts <- sema_counts[[model]][[src]]
  counts <- counts[complete.cases(counts), ]
  # NOTE(review): geom_bar() counts rows per word_category, whereas get_df()
  # sums the n column - confirm which of the two "Occurrences" is intended.
  ggplot(counts) +
    geom_bar(aes(y = word_category, fill = word_category)) +
    labs(
      title = paste(
        "Most frequent tags in", src, "surveys w/", title_name, "model"
      ),
      x = "Occurrences",
      y = "Tags"
    ) +
    theme_minimal() +
    theme(aspect.ratio = 1)
})
write_plots_pdf("plots_sema.pdf", sema_plots)