this repo has no description
1# install.packages("tidyverse")
2library(tibble) # tibble (comes from tidyverse)
3# install.packages("word2vec")
4library(word2vec) # read.wordvectors
5# install.packages("reticulate")
6library(reticulate) # reticulate::py_eval
7# install.packages("tidytext")
8library(tidytext) # data("stop_words")
9# install.packages("dplyr")
10library(dplyr) # anti_join
11# install.packages("SnowballC")
12library(SnowballC) # wordStem
13# install.packages("gglot2")
14library(ggplot2) # ggplot
15
16# This will load the data set needed for stop words
17data("stop_words")
18
19# w2v : (String, Embedding) -> Vector
20# This function, given a string and an embedding will return the vector
21# associated to that word. In the case where the word cannot be found in the
22# embedding, the vector returned is the zero vector. Additionally, the word is
23# cleaned before processing it.
24w2v <- function(str, emb) {
25 clean_str <- txt_clean_word2vec(str)
26 if (any(clean_str == rownames(emb))) emb[clean_str,] else numeric(ncol(emb))
27}
28
29# w2v_v : ([String], Embedding) -> Matrix
30# This function, given a list of strings and an embedding will return the
31# vectors associated to each word as a matrix. Each column of the Matrix
32# returned corresponds to the vectors associated to that word. Because of this,
33# you can access corresponding vectors like:
34# > apple <- w2v_v(list("apple", "grape") |> as.characte(), emb)[,"apple"]
35# > grape <- w2v_v(list("apple", "grape") |> as.characte(), emb)[,"grape"]
36w2v_v <- function(words, emb) Vectorize(function(str) w2v(str, emb))(words)
37
38# sim : (String, String, Embedding) -> Matrix
39# sim : (String, String, Embedding, String) -> Matrix
40# sim : ([String], [String], Embedding) -> Matrix
41# sim : ([String], [String], Embedding, String) -> Matrix
42# This function, given two strings or lists of strings and an embedding, will
43# return the semantic similarity between those two words or list of words as a
44# Floating-point Matrix where each entry is between -1.0 and 1.0. This semantic
45# similarity is just the cosine of the angle between the two vectors.
46sim <- function(w1, w2, emb, type = "cosine") word2vec_similarity(w2v(w1, emb), w2v(w2, emb), type = type)
47
48# When Python writes a pandas.DataFrame into a CSV, it will run each cell in the
49# DataFrame through repr and write that into the CSV. This means that when R
50# reads the csv, R will interpret each cell in the CSV as a string, instead of
51# the correct type that is supposed to actually be there.
52py2r <- function(df, rows) {
53 for (row in rows) {
54 # In this case we use reticulate::py_eval to convert a python string
55 # into the R representation.
56 df[[row]] <- lapply(df[[row]], reticulate::py_eval)
57 }
58 return(df)
59}
60
61# rep : (String, Integer) -> [String]
62# This function, given a string and an integer, will return a list containing
63# the string repeated however many times the integer says it should.
64rep <- function(str, count) {
65 out <- character(length = count)
66 for (i in 1:count) {
67 out[i] <- str
68 }
69 out
70}
71
72# These are the tags that are used to categorize the data.
73lgbt_tags <- c("lgbt","lgbtq","sex","identity","gender","orientation","nonbinary") |> as.character()
74race_ethnicity_tags <- c("race","ethnicity","african","american","black","hispanic","asian","indigenous","native","latino","latina","latine") |> as.character()
75women_tags <- c("woman","women","girl","feminine","femeninity","ms","mrs") |> as.character()
76men_tags <- c("man", "men", "boy", "male", "masculine", "masculinity", "mr") |> as.character()
77disabilities_tags <- c("disabilities","disabled","disability","handicap","handicapped","neurodivergent") |> as.character()
78
79# This variable holds all of the tags. Additionally, tag_categories holds all
80# the tags together with their categories.
81tags <- c(lgbt_tags, race_ethnicity_tags, women_tags, men_tags, disabilities_tags)
82tag_categories <- c(
83 rep("lgbt", length(lgbt_tags)),
84 rep("race/ethnicity", length(race_ethnicity_tags)),
85 rep("women", length(women_tags)),
86 rep("men", length(men_tags)),
87 rep("disabilities", length(disabilities_tags))
88)
89
90#
91similarity_values <- numeric(length = length(tags))
92word_category <- function(word, tag_vectors, emb, threshold = 0.3) {
93 for (i in seq_along(tags)) {
94 similarity_values[i] <- word2vec_similarity(w2v(word, emb), tag_vectors[,i], type = "cosine")
95 }
96
97 similarities <- data.frame(
98 sim = similarity_values,
99 tag = tags,
100 tag_category = tag_categories
101 )
102
103 similarities <- similarities[similarities |> complete.cases(),]
104 m <- max(similarities $ sim, na.rm = TRUE)
105 if (m > threshold) similarities[m == similarities,] |> head(1) else data.frame(sim = NA, tag = NA, tag_category = NA)
106}
107
108memo <- new.env(hash = TRUE, parent = emptyenv())
109word_category_m <- function(x, tag_vectors, emb, threshold = 0.3) {
110 if (is.null(memo[[x]])) {
111 memo[[x]] <- word_category(x, tag_vectors, emb, threshold)
112 }
113 return(memo[[x]])
114}
115
116word_category_v <- function(words, tag_vectors, emb, threshold = 0.3) {
117 res <- Vectorize(function(word) word_category_m(word, tag_vectors, emb))(words) |> t()
118 sim <- numeric(length = length(words))
119 tags <- character(length = length(words))
120 tag_categories <- character(length = length(words))
121 for (i in 1:nrow(res)) {
122 sim[i] <- res[,"sim"][[i]]
123 tags[i] <- res[,"tag"][[i]]
124 tag_categories[i] <- res[,"tag_category"][[i]]
125 }
126 data.frame(
127 sim = sim,
128 tag = tags,
129 tag_category = tag_categories
130 )
131}
132
133# Read data from CSV and convert it into its R representation
134data <- read.csv("data/variables.csv") |> py2r(rows = 4:9)
135ams_selection <- data[,"title"] == "American Mathematical Society"
136data[ams_selection,"title"] <- "AMS"
137cbms_selection <- data[,"title"] == "Conference Board of the Mathematical Sciences 2021 Survey"
138data[cbms_selection,"title"] <- "CBMS"
139ipeds_selection <- data[,"title"] == "Integrated Postsecondary Education Data System (IPEDS) Institution Lookup"
140data[ipeds_selection,"title"] <- "IPEDS"
141selection <- ams_selection | cbms_selection | ipeds_selection
142
143data <- data[selection,]
144
145clean_text <- function(raw_text) {
146 tmp <- tibble(
147 line = seq_along(raw_text),
148 text = raw_text
149 ) |> unnest_tokens(word, text)
150 tmp[!grepl("\\d", tmp $ word),] |> anti_join(stop_words)
151}
152
153# Word stems analysis
154
155for (row_idx in 1:nrow(data)) {
156 title <- data[row_idx,"title"]
157 stem_txt <- clean_text(data[row_idx,"variables"][[1]]) |>
158 mutate(word_stem = wordStem(word))
159
160 stem_count <- stem_txt |>
161 inner_join(count(stem_txt, word_stem)) |>
162 filter(n > 5) |>
163 distinct(word_stem, .keep_all = TRUE)
164
165 save(stem_count, file = paste("data/stem_", title, ".Rda", sep = ""))
166}
167
168
169# Word semantics analysis
170
171word_semantic_analysis <- function(emb, data, model_name, threshold = 0.3) {
172 tag_vectors <- w2v_v(tags, emb)
173 for (row_idx in 1:nrow(data)) {
174 title <- data[row_idx,"title"]
175 clean_txt <- clean_text(data[row_idx, "variables"][[1]])
176
177 memo <- new.env(hash = TRUE, parent = emptyenv())
178 word_category_m <- function(x, tag_vectors, emb, threshold = 0.3) {
179 if (is.null(memo[[x]])) {
180 memo[[x]] <- word_category(x, tag_vectors, emb, threshold)
181 }
182 return(memo[[x]])
183 }
184
185 word_category_v <- function(words, tag_vectors, emb, threshold = 0.3) {
186 res <- Vectorize(function(word) word_category_m(word, tag_vectors, emb, threshold))(words) |> t()
187 sim <- numeric(length = length(words))
188 tags <- character(length = length(words))
189 tag_categories <- character(length = length(words))
190 for (i in 1:nrow(res)) {
191 sim[i] <- res[,"sim"][[i]]
192 tags[i] <- res[,"tag"][[i]]
193 tag_categories[i] <- res[,"tag_category"][[i]]
194 }
195 data.frame(
196 sim = sim,
197 tag = tags,
198 tag_category = tag_categories
199 )
200 }
201
202 word_categories <- word_category_v(clean_txt $ word, tag_vectors, emb, threshold)
203 sema_txt <- clean_txt |>
204 mutate(tag = word_categories $ tag, word_category = word_categories $ tag_category)
205
206 sema_count <- sema_txt |>
207 inner_join(count(sema_txt, tag)) |>
208 distinct(tag, .keep_all = TRUE)
209
210 save(sema_count, file = paste("data/", model_name, "_", threshold, "_", title, ".Rda", sep = ""))
211 }
212}
213
214# emb (short for embedding) is a matrix with 3,000,000 rows and 300 columns.
215# Each row represents a point in 300-dimensional space. Since this is a
216# two-dimensional matrix, you can access a specific coordinate using:
217# > emb["your_word_here",number_of_coordinate_here]
218# However, what we want to do is associate an english word with a point in 300
219# dimensions. The way in which we will be using emb is:
220# > grape <- emb["apple",]
221# > apple <- emb["grape",]
222# After this, we can calculate how "close" grape and apple are semantically:
223# > word2vec_similarity(grape, apple, type = "cosine")
224# This returns a number between 0.0 and 1.0, where 0.0 represents completely
225# different words and 1.0 represents the same word.
226# For more information on how this works check out:
227# - https://code.google.com/archive/p/word2vec/
228
229read.wordvectors("google_vecs.bin", type = "bin") |>
230 word_semantic_analysis(data = data, model_name = "google_news", threshold = 0.185)
231
232read.wordvectors("glove.6B.300d.txt", type = "txt") |>
233 word_semantic_analysis(data = data, model_name = "glove_300d", threshold = 0.134)
234
235read.wordvectors("glove.6B.200d.txt", type = "txt") |>
236 word_semantic_analysis(data = data, model_name = "glove_200d", threshold = 0.164)
237
238read.wordvectors("glove.6B.100d.txt", type = "txt") |>
239 word_semantic_analysis(data = data, model_name = "glove_100d", threshold = 0.219)
240
241read.wordvectors("glove.6B.50d.txt", type = "txt") |>
242 word_semantic_analysis(data = data, model_name = "glove_50d", threshold = 0.273)