this repo has no description
1# install.packages("tidyverse")
2library(tibble) # tibble (comes from tidyverse)
3# install.packages("word2vec")
4library(word2vec) # read.wordvectors
5# install.packages("reticulate")
6library(reticulate) # reticulate::py_eval
7# install.packages("tidytext")
8library(tidytext) # data("stop_words")
9# install.packages("dplyr")
10library(dplyr) # anti_join
11# install.packages("SnowballC")
12library(SnowballC) # wordStem
# install.packages("ggplot2")
14library(ggplot2) # ggplot
15library(viridis) # scale_viridis_d
16library(tidyverse) # ????
17
18
# ------------------------- stemmed word counts -------------------------

# Each .Rda file is expected to define an object named `stem_count`; copy it
# to a survey-specific name right away, before the next load() replaces it.
load("data/stem_AMS.Rda")
stem_count_ams <- stem_count
load("data/stem_CBMS.Rda")
stem_count_cbms <- stem_count
load("data/stem_IPEDS.Rda")
stem_count_ipeds <- stem_count

# Keyed by survey name so each plot title travels with its data instead of
# being looked up through a separately maintained counter.
stem_counts <- list(
  AMS = stem_count_ams,
  CBMS = stem_count_cbms,
  IPEDS = stem_count_ipeds
)

# One bar chart per survey, one page each, in a single PDF.
pdf("plots_stem.pdf", onefile = TRUE)
for (survey in names(stem_counts)) {
  # Drop rows that have a missing value in any column before plotting.
  counts <- stem_counts[[survey]]
  counts <- counts[complete.cases(counts), ]
  print(
    ggplot(counts, aes(x = reorder(word, n), y = n)) +
      geom_col() +
      coord_flip() +
      labs(
        title = paste("Most frequent words in", survey, "surveys"),
        # labs() names aesthetics, not screen positions: `x` is the word
        # aesthetic and `y` is the count, also after coord_flip().
        x = "Words",
        y = "Occurrences"
      ) +
      theme_minimal() +
      theme(aspect.ratio = 1)
  )
}
dev.off()
47
# --------------------- semantic category counts ---------------------

# Load one saved table and return the `sema_count` object stored in it.
#
# load() is pointed at a throwaway environment, so a file that lacks
# `sema_count` raises an error instead of silently reusing the table left
# behind by a previous load().
#
# Args:
#   path: path to an .Rda file.
#
# Returns:
#   The object saved under the name `sema_count` in that file.
read_sema_count <- function(path) {
  loaded <- new.env(parent = emptyenv())
  load(path, envir = loaded)
  if (!exists("sema_count", envir = loaded, inherits = FALSE)) {
    stop("No object named 'sema_count' in ", path, call. = FALSE)
  }
  get("sema_count", envir = loaded, inherits = FALSE)
}

# GloVe, 100 dimensions
sema_count_glove_100d_ams <-
  read_sema_count("data/glove_100d_0.219_AMS.Rda")
sema_count_glove_100d_cbms <-
  read_sema_count("data/glove_100d_0.219_CBMS.Rda")
sema_count_glove_100d_ipeds <-
  read_sema_count("data/glove_100d_0.219_IPEDS.Rda")

# GloVe, 200 dimensions
sema_count_glove_200d_ams <-
  read_sema_count("data/glove_200d_0.164_AMS.Rda")
sema_count_glove_200d_cbms <-
  read_sema_count("data/glove_200d_0.164_CBMS.Rda")
sema_count_glove_200d_ipeds <-
  read_sema_count("data/glove_200d_0.164_IPEDS.Rda")

# GloVe, 300 dimensions
sema_count_glove_300d_ams <-
  read_sema_count("data/glove_300d_0.134_AMS.Rda")
sema_count_glove_300d_cbms <-
  read_sema_count("data/glove_300d_0.134_CBMS.Rda")
sema_count_glove_300d_ipeds <-
  read_sema_count("data/glove_300d_0.134_IPEDS.Rda")

# GloVe, 50 dimensions
sema_count_glove_50d_ams <-
  read_sema_count("data/glove_50d_0.273_AMS.Rda")
sema_count_glove_50d_cbms <-
  read_sema_count("data/glove_50d_0.273_CBMS.Rda")
sema_count_glove_50d_ipeds <-
  read_sema_count("data/glove_50d_0.273_IPEDS.Rda")

# Google News
sema_count_google_ams <-
  read_sema_count("data/google_news_0.185_AMS.Rda")
sema_count_google_cbms <-
  read_sema_count("data/google_news_0.185_CBMS.Rda")
sema_count_google_ipeds <-
  read_sema_count("data/google_news_0.185_IPEDS.Rda")
78
# Build the table behind one grouped bar chart: for the category `str`, the
# total count under every embedding model and survey source.
#
# Reads the fifteen `sema_count_*` data frames from the enclosing (global)
# environment; each is accessed through its `word_category` and `n` columns.
#
# Args:
#   str: category label compared against `word_category`, e.g. "lgbt".
#
# Returns:
#   A data.frame with 15 rows (5 models x 3 surveys) and the columns
#   `model` (character), `title` (character, the survey name) and
#   `n` (numeric: sum of `n` over the matching rows, NA values ignored).
get_df <- function(str) {
  models <- c(
    "GloVe 50D", "GloVe 100D", "GloVe 200D", "GloVe 300D", "Google News"
  )

  # Tables within each survey are listed in exactly the order of `models`,
  # so every total ends up next to the label of the model that produced it.
  counts_by_survey <- list(
    IPEDS = list(
      sema_count_glove_50d_ipeds,
      sema_count_glove_100d_ipeds,
      sema_count_glove_200d_ipeds,
      sema_count_glove_300d_ipeds,
      sema_count_google_ipeds
    ),
    CBMS = list(
      sema_count_glove_50d_cbms,
      sema_count_glove_100d_cbms,
      sema_count_glove_200d_cbms,
      sema_count_glove_300d_cbms,
      sema_count_google_cbms
    ),
    AMS = list(
      sema_count_glove_50d_ams,
      sema_count_glove_100d_ams,
      sema_count_glove_200d_ams,
      sema_count_glove_300d_ams,
      sema_count_google_ams
    )
  )

  # Sum `n` over the rows of one table whose category equals `str`.
  # which() drops rows where the comparison is NA.
  category_total <- function(counts) {
    matches <- which(counts$word_category == str)
    as.numeric(sum(counts$n[matches], na.rm = TRUE))
  }

  totals <- lapply(
    counts_by_survey,
    function(tables) vapply(tables, category_total, numeric(1))
  )

  data.frame(
    model = rep(models, times = length(counts_by_survey)),
    title = rep(names(counts_by_survey), each = length(models)),
    # use.names = FALSE keeps the default 1..15 row names.
    n = unlist(totals, use.names = FALSE)
  )
}
102
# ------------------ tag totals per model and survey ------------------

# One summary table per tag (see get_df): total count for every
# model/survey combination.
lgbt_df <- get_df("lgbt")
race_ethn_df <- get_df("race/ethnicity")
women_df <- get_df("women")
disabilities_df <- get_df("disabilities")

# Display names, in the same order as `tag_dfs`.
tags <- c("LGBT", "Race/Ethnicity", "Women", "Disabilities")
tag_dfs <- list(lgbt_df, race_ethn_df, women_df, disabilities_df)

# One grouped bar chart per tag, one page each.
pdf("tags.pdf")
for (i in seq_along(tag_dfs)) {
  tag_name <- tags[i]
  print(
    ggplot(tag_dfs[[i]], aes(fill = model, x = title, y = n)) +
      geom_bar(position = "dodge", stat = "identity") +
      scale_fill_viridis_d() +
      labs(
        # Without a title the four pages cannot be told apart.
        title = tag_name,
        x = "Data Source",
        y = "Occurrences"
      ) +
      theme_light() +
      theme(aspect.ratio = 1)
  )
}
dev.off()
132
133# women
134# race/ethnicity
135# disabilities
136
# ------------------ tag distribution per model/survey ------------------

# Tables to plot; `names` holds the matching title fragments in the same
# order (model by model, and AMS / CBMS / IPEDS within each model).
data_sources <- list(
  sema_count_glove_100d_ams,
  sema_count_glove_100d_cbms,
  sema_count_glove_100d_ipeds,
  sema_count_glove_200d_ams,
  sema_count_glove_200d_cbms,
  sema_count_glove_200d_ipeds,
  sema_count_glove_300d_ams,
  sema_count_glove_300d_cbms,
  sema_count_glove_300d_ipeds,
  sema_count_glove_50d_ams,
  sema_count_glove_50d_cbms,
  sema_count_glove_50d_ipeds,
  sema_count_google_ams,
  sema_count_google_cbms,
  sema_count_google_ipeds
)
names <- list(
  "AMS surveys w/ 100-D GloVe model",
  "CBMS surveys w/ 100-D GloVe model",
  "IPEDS surveys w/ 100-D GloVe model",
  "AMS surveys w/ 200-D GloVe model",
  "CBMS surveys w/ 200-D GloVe model",
  "IPEDS surveys w/ 200-D GloVe model",
  "AMS surveys w/ 300-D GloVe model",
  "CBMS surveys w/ 300-D GloVe model",
  "IPEDS surveys w/ 300-D GloVe model",
  "AMS surveys w/ 50-D GloVe model",
  "CBMS surveys w/ 50-D GloVe model",
  "IPEDS surveys w/ 50-D GloVe model",
  "AMS surveys w/ Google News model",
  "CBMS surveys w/ Google News model",
  "IPEDS surveys w/ Google News model"
)
# The two lists are paired by position; fail fast if they drift apart.
stopifnot(length(data_sources) == length(names))

# One chart per model/survey combination, one page each.
pdf("plots_sema.pdf", onefile = TRUE)
for (i in seq_along(data_sources)) {
  # Drop rows that have a missing value in any column before plotting.
  data <- data_sources[[i]]
  data <- data[complete.cases(data), ]
  print(
    ggplot(data) +
      # Weight each row by its count `n` so that bar length is the number of
      # occurrences per tag rather than the number of rows per tag.
      # NOTE(review): assumes `n` is a per-row occurrence count -- confirm
      # against the code that writes the .Rda files.
      geom_bar(aes(y = word_category, fill = word_category, weight = n)) +
      labs(
        title = paste("Most frequent tags in", names[[i]]),
        x = "Occurrences",
        y = "Tags"
      ) +
      theme_minimal() +
      theme(aspect.ratio = 1)
  )
}
dev.off()
173dev.off()