code/variable_analysis.R at main · stau.space/underrepresentation-theory

this repo has no description
underrepresentation-theory / code / variable_analysis.R
at main 6.5 kB view raw
  1# install.packages("tidyverse")
  2library(tibble) # tibble (comes from tidyverse)
  3# install.packages("word2vec")
  4library(word2vec) # read.wordvectors
  5# install.packages("reticulate")
  6library(reticulate) # reticulate::py_eval
  7# install.packages("tidytext")
  8library(tidytext) # data("stop_words")
  9# install.packages("dplyr")
 10library(dplyr) # anti_join
 11# install.packages("SnowballC")
 12library(SnowballC) # wordStem
# install.packages("ggplot2")
 14library(ggplot2) # ggplot
 15library(viridis) # scale_viridis_d
 16library(tidyverse) # ????
 17
 18
 19load("data/stem_AMS.Rda")
 20stem_count_ams <- stem_count
 21load("data/stem_CBMS.Rda")
 22stem_count_cbms <- stem_count
 23load("data/stem_IPEDS.Rda")
 24stem_count_ipeds <- stem_count
 25
 26data_sources <- list(stem_count_ams,stem_count_cbms,stem_count_ipeds)
 27names <- list("AMS", "CBMS", "IPEDS")
 28i <- 1
 29pdf("plots_stem.pdf", onefile = TRUE)
 30for (data in data_sources) {
 31	data <- data[complete.cases(data),]
 32	print(
 33		ggplot(data, aes(x = reorder(word, n), y = n)) +
 34		geom_col() +
 35		coord_flip() +
 36		labs(
 37			title = paste("Most frequent words in", names[[i]], "surveys"),
 38			x = "Occurrences",
 39			y = "Words"
 40		) +
 41		theme_minimal() +
 42		theme(aspect.ratio = 1)
 43	)
 44	i <- i + 1
 45}
 46dev.off()
 47
 48load("data/glove_100d_0.219_AMS.Rda")
 49sema_count_glove_100d_ams <- sema_count
 50load("data/glove_100d_0.219_CBMS.Rda")
 51sema_count_glove_100d_cbms <- sema_count
 52load("data/glove_100d_0.219_IPEDS.Rda")
 53sema_count_glove_100d_ipeds <- sema_count
 54load("data/glove_200d_0.164_AMS.Rda")
 55sema_count_glove_200d_ams <- sema_count
 56load("data/glove_200d_0.164_CBMS.Rda")
 57sema_count_glove_200d_cbms <- sema_count
 58load("data/glove_200d_0.164_IPEDS.Rda")
 59sema_count_glove_200d_ipeds <- sema_count
 60load("data/glove_300d_0.134_AMS.Rda")
 61sema_count_glove_300d_ams <- sema_count
 62load("data/glove_300d_0.134_CBMS.Rda")
 63sema_count_glove_300d_cbms <- sema_count
 64load("data/glove_300d_0.134_IPEDS.Rda")
 65sema_count_glove_300d_ipeds <- sema_count
 66load("data/glove_50d_0.273_AMS.Rda")
 67sema_count_glove_50d_ams <- sema_count
 68load("data/glove_50d_0.273_CBMS.Rda")
 69sema_count_glove_50d_cbms <- sema_count
 70load("data/glove_50d_0.273_IPEDS.Rda")
 71sema_count_glove_50d_ipeds <- sema_count
 72load("data/google_news_0.185_AMS.Rda")
 73sema_count_google_ams <- sema_count
 74load("data/google_news_0.185_CBMS.Rda")
 75sema_count_google_cbms <- sema_count
 76load("data/google_news_0.185_IPEDS.Rda")
 77sema_count_google_ipeds <- sema_count
 78
 79get_df <- function(str) {
 80	data.frame(
 81		model = list("GloVe 50D", "GloVe 100D", "GloVe 200D", "GloVe 300D", "Google News", "GloVe 50D", "GloVe 100D", "GloVe 200D", "GloVe 300D", "Google News", "GloVe 50D", "GloVe 100D", "GloVe 200D", "GloVe 300D", "Google News") |> as.character(),
 82		title = list("IPEDS", "IPEDS", "IPEDS", "IPEDS", "IPEDS", "CBMS", "CBMS", "CBMS", "CBMS", "CBMS", "AMS", "AMS", "AMS", "AMS", "AMS") |> as.character(),
 83		n = list(
 84			sema_count_glove_50d_ipeds[sema_count_glove_50d_ipeds $ word_category == str,] $ n |> sum(na.rm = TRUE),
 85			sema_count_glove_300d_ipeds[sema_count_glove_300d_ipeds $ word_category == str,] $ n |> sum(na.rm = TRUE),
 86			sema_count_glove_200d_ipeds[sema_count_glove_200d_ipeds $ word_category == str,] $ n |> sum(na.rm = TRUE),
 87			sema_count_glove_100d_ipeds[sema_count_glove_100d_ipeds $ word_category == str,] $ n |> sum(na.rm = TRUE),
 88			sema_count_google_ipeds[sema_count_google_ipeds $ word_category == str,] $ n |> sum(na.rm = TRUE),
 89			sema_count_glove_50d_cbms[sema_count_glove_50d_cbms $ word_category == str,] $ n |> sum(na.rm = TRUE),
 90			sema_count_glove_300d_cbms[sema_count_glove_300d_cbms $ word_category == str,] $ n |> sum(na.rm = TRUE),
 91			sema_count_glove_200d_cbms[sema_count_glove_200d_cbms $ word_category == str,] $ n |> sum(na.rm = TRUE),
 92			sema_count_glove_100d_cbms[sema_count_glove_100d_cbms $ word_category == str,] $ n |> sum(na.rm = TRUE),
 93			sema_count_google_cbms[sema_count_google_cbms $ word_category == str,] $ n |> sum(na.rm = TRUE),
 94			sema_count_glove_50d_ams[sema_count_glove_50d_ams $ word_category == str,] $ n |> sum(na.rm = TRUE),
 95			sema_count_glove_300d_ams[sema_count_glove_300d_ams $ word_category == str,] $ n |> sum(na.rm = TRUE),
 96			sema_count_glove_200d_ams[sema_count_glove_200d_ams $ word_category == str,] $ n |> sum(na.rm = TRUE),
 97			sema_count_glove_100d_ams[sema_count_glove_100d_ams $ word_category == str,] $ n |> sum(na.rm = TRUE),
 98			sema_count_google_ams[sema_count_google_ams $ word_category == str,] $ n |> sum(na.rm = TRUE)
 99		) |> as.numeric()
100	)
101}
102
# ------------------------------ tags ------------------------------
104
# Per-model, per-source totals for each tag category.
lgbt_df <- get_df("lgbt")

race_ethn_df <- get_df("race/ethnicity")

women_df <- get_df("women")

disabilities_df <- get_df("disabilities")

# `tags` and `tag_dfs` are parallel: tags[i] is the display name of the
# category summarised in tag_dfs[[i]].
tags <- c("LGBT", "Race/Ethnicity", "Women", "Disabilities")
tag_dfs <- list(lgbt_df, race_ethn_df, women_df, disabilities_df)

# One grouped bar chart per tag category: survey sources on the x axis, one
# bar per embedding model within each source.
pdf("tags.pdf")
for (i in seq_along(tag_dfs)) {
	tag_name <- tags[i]
	print(
		ggplot(tag_dfs[[i]], aes(fill = model, x = title, y = n)) +
			geom_bar(position = "dodge", stat = "identity") +
			scale_fill_viridis_d() +
			labs(
				# Without a title the four pages cannot be told apart.
				title = paste(tag_name, "tag occurrences by data source"),
				x = "Data Source",
				y = "Occurrences"
			) +
			theme_light() +
			theme(aspect.ratio = 1)
	)
}
dev.off()
132
133# women
134# race/ethnicity
135# disabilities
136
# Every model/source combination, grouped by model; `names` below lists the
# matching plot-title suffixes in the same order.
data_sources <- list(
	sema_count_glove_100d_ams,
	sema_count_glove_100d_cbms,
	sema_count_glove_100d_ipeds,
	sema_count_glove_200d_ams,
	sema_count_glove_200d_cbms,
	sema_count_glove_200d_ipeds,
	sema_count_glove_300d_ams,
	sema_count_glove_300d_cbms,
	sema_count_glove_300d_ipeds,
	sema_count_glove_50d_ams,
	sema_count_glove_50d_cbms,
	sema_count_glove_50d_ipeds,
	sema_count_google_ams,
	sema_count_google_cbms,
	sema_count_google_ipeds
)
names <- list(
	"AMS surveys w/ 100-D GloVe model",
	"CBMS surveys w/ 100-D GloVe model",
	"IPEDS surveys w/ 100-D GloVe model",
	"AMS surveys w/ 200-D GloVe model",
	"CBMS surveys w/ 200-D GloVe model",
	"IPEDS surveys w/ 200-D GloVe model",
	"AMS surveys w/ 300-D GloVe model",
	"CBMS surveys w/ 300-D GloVe model",
	"IPEDS surveys w/ 300-D GloVe model",
	"AMS surveys w/ 50-D GloVe model",
	"CBMS surveys w/ 50-D GloVe model",
	"IPEDS surveys w/ 50-D GloVe model",
	"AMS surveys w/ Google News model",
	"CBMS surveys w/ Google News model",
	"IPEDS surveys w/ Google News model"
)

# One bar chart of tag frequencies per model/source combination, written as
# pages of a single PDF.
pdf("plots_sema.pdf", onefile = TRUE)
for (i in seq_along(data_sources)) {
	# Keep only the rows without any NA.
	complete_rows <- data_sources[[i]][complete.cases(data_sources[[i]]), ]

	# With only `y` mapped, geom_bar() counts rows per `word_category`; it does
	# not sum the `n` column.
	# NOTE(review): confirm that row counts are the intended "Occurrences".
	tag_plot <- ggplot(complete_rows) +
		geom_bar(aes(y = word_category, fill = word_category)) +
		labs(
			title = paste("Most frequent tags in", names[[i]]),
			x = "Occurrences",
			y = "Tags"
		) +
		theme_minimal() +
		theme(aspect.ratio = 1)

	print(tag_plot)
}
dev.off()