code/generate_data.R at main · stau.space/underrepresentation-theory

this repo has no description
underrepresentation-theory / code / generate_data.R
at main 242 lines 9.5 kB view raw
  1# install.packages("tidyverse")
  2library(tibble) # tibble (comes from tidyverse)
  3# install.packages("word2vec")
  4library(word2vec) # read.wordvectors
  5# install.packages("reticulate")
  6library(reticulate) # reticulate::py_eval
  7# install.packages("tidytext")
  8library(tidytext) # data("stop_words")
  9# install.packages("dplyr")
 10library(dplyr) # anti_join
 11# install.packages("SnowballC")
 12library(SnowballC) # wordStem
 13# install.packages("gglot2")
 14library(ggplot2) # ggplot
 15
 16# This will load the data set needed for stop words
 17data("stop_words")
 18
 19# w2v : (String, Embedding) -> Vector
 20# This function, given a string and an embedding will return the vector
 21# associated to that word. In the case where the word cannot be found in the
 22# embedding, the vector returned is the zero vector. Additionally, the word is
 23# cleaned before processing it.
 24w2v <- function(str, emb) {
 25	clean_str <- txt_clean_word2vec(str)
 26	if (any(clean_str == rownames(emb))) emb[clean_str,] else numeric(ncol(emb))
 27}
 28
 29# w2v_v : ([String], Embedding) -> Matrix
 30# This function, given a list of strings and an embedding will return the
 31# vectors associated to each word as a matrix. Each column of the Matrix
 32# returned corresponds to the vectors associated to that word. Because of this,
 33# you can access corresponding vectors like:
 34# > apple <- w2v_v(list("apple", "grape") |> as.characte(), emb)[,"apple"]
 35# > grape <- w2v_v(list("apple", "grape") |> as.characte(), emb)[,"grape"]
 36w2v_v <- function(words, emb) Vectorize(function(str) w2v(str, emb))(words)
 37
 38# sim : (String, String, Embedding) -> Matrix
 39# sim : (String, String, Embedding, String) -> Matrix
 40# sim : ([String], [String], Embedding) -> Matrix
 41# sim : ([String], [String], Embedding, String) -> Matrix
 42# This function, given two strings or lists of strings and an embedding, will
 43# return the semantic similarity between those two words or list of words as a
 44# Floating-point Matrix where each entry is between -1.0 and 1.0. This semantic
 45# similarity is just the cosine of the angle between the two vectors.
 46sim <- function(w1, w2, emb, type = "cosine") word2vec_similarity(w2v(w1, emb), w2v(w2, emb), type = type)
 47
 48# When Python writes a pandas.DataFrame into a CSV, it will run each cell in the
 49# DataFrame through repr and write that into the CSV. This means that when R
 50# reads the csv, R will interpret each cell in the CSV as a string, instead of
 51# the correct type that is supposed to actually be there.
 52py2r <- function(df, rows) {
 53	for (row in rows) {
 54		# In this case we use reticulate::py_eval to convert a python string
 55		# into the R representation.
 56		df[[row]] <- lapply(df[[row]], reticulate::py_eval)
 57	}
 58	return(df)
 59}
 60
 61# rep : (String, Integer) -> [String]
 62# This function, given a string and an integer, will return a list containing
 63# the string repeated however many times the integer says it should.
 64rep <- function(str, count) {
 65	out <- character(length = count)
 66	for (i in 1:count) {
 67		out[i] <- str
 68	}
 69	out
 70}
 71
 72# These are the tags that are used to categorize the data.
 73lgbt_tags <- c("lgbt","lgbtq","sex","identity","gender","orientation","nonbinary") |> as.character()
 74race_ethnicity_tags <- c("race","ethnicity","african","american","black","hispanic","asian","indigenous","native","latino","latina","latine") |> as.character()
 75women_tags <- c("woman","women","girl","feminine","femeninity","ms","mrs") |> as.character()
 76men_tags <- c("man", "men", "boy", "male", "masculine", "masculinity", "mr") |> as.character()
 77disabilities_tags <- c("disabilities","disabled","disability","handicap","handicapped","neurodivergent") |> as.character()
 78
 79# This variable holds all of the tags. Additionally, tag_categories holds all
 80# the tags together with their categories.
 81tags <- c(lgbt_tags, race_ethnicity_tags, women_tags, men_tags, disabilities_tags)
 82tag_categories <- c(
 83	rep("lgbt", length(lgbt_tags)),
 84	rep("race/ethnicity", length(race_ethnicity_tags)),
 85	rep("women", length(women_tags)),
 86	rep("men", length(men_tags)),
 87	rep("disabilities", length(disabilities_tags))
 88)
 89
 90# 
 91similarity_values <- numeric(length = length(tags))
 92word_category <- function(word, tag_vectors, emb, threshold = 0.3) {
 93	for (i in seq_along(tags)) {
 94		similarity_values[i] <- word2vec_similarity(w2v(word, emb), tag_vectors[,i], type = "cosine")
 95	}
 96
 97	similarities <- data.frame(
 98			sim = similarity_values,
 99			tag = tags,
100			tag_category = tag_categories
101	)
102
103	similarities <- similarities[similarities |> complete.cases(),]
104	m <- max(similarities $ sim, na.rm = TRUE)
105	if (m > threshold) similarities[m == similarities,] |> head(1) else data.frame(sim = NA, tag = NA, tag_category = NA)
106}
107
108memo <- new.env(hash = TRUE, parent = emptyenv())
109word_category_m <- function(x, tag_vectors, emb, threshold = 0.3) {
110	if (is.null(memo[[x]])) {
111		memo[[x]] <- word_category(x, tag_vectors, emb, threshold)
112	}
113	return(memo[[x]])
114}
115
116word_category_v <- function(words, tag_vectors, emb, threshold = 0.3) {
117	res <- Vectorize(function(word) word_category_m(word, tag_vectors, emb))(words) |> t()
118	sim <- numeric(length = length(words))
119	tags <- character(length = length(words))
120	tag_categories <- character(length = length(words))
121	for (i in 1:nrow(res)) {
122		sim[i] <- res[,"sim"][[i]]
123		tags[i] <- res[,"tag"][[i]]
124		tag_categories[i] <- res[,"tag_category"][[i]]
125	}
126	data.frame(
127		sim = sim,
128		tag = tags,
129		tag_category = tag_categories
130	)
131}
132
133# Read data from CSV and convert it into its R representation
134data <- read.csv("data/variables.csv") |> py2r(rows = 4:9)
135ams_selection <- data[,"title"] == "American Mathematical Society"
136data[ams_selection,"title"] <- "AMS"
137cbms_selection <- data[,"title"] == "Conference Board of the Mathematical Sciences 2021 Survey"
138data[cbms_selection,"title"] <- "CBMS"
139ipeds_selection <- data[,"title"] == "Integrated Postsecondary Education Data System (IPEDS) Institution Lookup"
140data[ipeds_selection,"title"] <- "IPEDS"
141selection <- ams_selection | cbms_selection | ipeds_selection
142
143data <- data[selection,]
144
145clean_text <- function(raw_text) {
146	tmp <- tibble(
147			line = seq_along(raw_text),
148			text = raw_text
149		) |> unnest_tokens(word, text)
150	tmp[!grepl("\\d", tmp $ word),] |> anti_join(stop_words)
151}
152
153# Word stems analysis
154
155for (row_idx in 1:nrow(data)) {
156	title <- data[row_idx,"title"]
157	stem_txt <- clean_text(data[row_idx,"variables"][[1]]) |>
158		mutate(word_stem = wordStem(word))
159
160	stem_count <- stem_txt |>
161		inner_join(count(stem_txt, word_stem)) |>
162		filter(n > 5) |>
163		distinct(word_stem, .keep_all = TRUE)
164
165	save(stem_count, file = paste("data/stem_", title, ".Rda", sep = ""))
166}
167
168
169# Word semantics analysis
170
171word_semantic_analysis <- function(emb, data, model_name, threshold = 0.3) {
172	tag_vectors <- w2v_v(tags, emb)
173	for (row_idx in 1:nrow(data)) {
174		title <- data[row_idx,"title"]
175		clean_txt <- clean_text(data[row_idx, "variables"][[1]])
176
177		memo <- new.env(hash = TRUE, parent = emptyenv())
178		word_category_m <- function(x, tag_vectors, emb, threshold = 0.3) {
179			if (is.null(memo[[x]])) {
180				memo[[x]] <- word_category(x, tag_vectors, emb, threshold)
181			}
182			return(memo[[x]])
183		}
184
185		word_category_v <- function(words, tag_vectors, emb, threshold = 0.3) {
186			res <- Vectorize(function(word) word_category_m(word, tag_vectors, emb, threshold))(words) |> t()
187			sim <- numeric(length = length(words))
188			tags <- character(length = length(words))
189			tag_categories <- character(length = length(words))
190			for (i in 1:nrow(res)) {
191				sim[i] <- res[,"sim"][[i]]
192				tags[i] <- res[,"tag"][[i]]
193				tag_categories[i] <- res[,"tag_category"][[i]]
194			}
195			data.frame(
196				sim = sim,
197				tag = tags,
198				tag_category = tag_categories
199			)
200		}
201
202		word_categories <- word_category_v(clean_txt $ word, tag_vectors, emb, threshold)
203		sema_txt <- clean_txt |>
204			mutate(tag = word_categories $ tag, word_category = word_categories $ tag_category)
205
206		sema_count <- sema_txt |>
207			inner_join(count(sema_txt, tag)) |>
208			distinct(tag, .keep_all = TRUE)
209
210		save(sema_count, file = paste("data/", model_name, "_", threshold, "_", title, ".Rda", sep = ""))
211	}
212}
213
214# emb (short for embedding) is a matrix with 3,000,000 rows and 300 columns.
215# Each row represents a point in 300-dimensional space. Since this is a
216# two-dimensional matrix, you can access a specific coordinate using:
217# > emb["your_word_here",number_of_coordinate_here]
218# However, what we want to do is associate an english word with a point in 300
219# dimensions. The way in which we will be using emb is:
220# > grape <- emb["apple",]
221# > apple <- emb["grape",]
222# After this, we can calculate how "close" grape and apple are semantically:
223# > word2vec_similarity(grape, apple, type = "cosine")
224# This returns a number between 0.0 and 1.0, where 0.0 represents completely
225# different words and 1.0 represents the same word.
226# For more information on how this works check out:
227# - https://code.google.com/archive/p/word2vec/
228
229read.wordvectors("google_vecs.bin", type = "bin") |>
230	word_semantic_analysis(data = data, model_name = "google_news", threshold = 0.185)
231
232read.wordvectors("glove.6B.300d.txt", type = "txt") |>
233	word_semantic_analysis(data = data, model_name = "glove_300d", threshold = 0.134)
234
235read.wordvectors("glove.6B.200d.txt", type = "txt") |>
236	word_semantic_analysis(data = data, model_name = "glove_200d", threshold = 0.164)
237
238read.wordvectors("glove.6B.100d.txt", type = "txt") |>
239	word_semantic_analysis(data = data, model_name = "glove_100d", threshold = 0.219)
240
241read.wordvectors("glove.6B.50d.txt", type = "txt") |>
242	word_semantic_analysis(data = data, model_name = "glove_50d", threshold = 0.273)