# install.packages("tidyverse") library(tibble) # tibble (comes from tidyverse) # install.packages("word2vec") library(word2vec) # read.wordvectors # install.packages("reticulate") library(reticulate) # reticulate::py_eval # install.packages("tidytext") library(tidytext) # data("stop_words") # install.packages("dplyr") library(dplyr) # anti_join # install.packages("SnowballC") library(SnowballC) # wordStem # install.packages("gglot2") library(ggplot2) # ggplot # This will load the data set needed for stop words data("stop_words") # w2v : (String, Embedding) -> Vector # This function, given a string and an embedding will return the vector # associated to that word. In the case where the word cannot be found in the # embedding, the vector returned is the zero vector. Additionally, the word is # cleaned before processing it. w2v <- function(str, emb) { clean_str <- txt_clean_word2vec(str) if (any(clean_str == rownames(emb))) emb[clean_str,] else numeric(ncol(emb)) } # w2v_v : ([String], Embedding) -> Matrix # This function, given a list of strings and an embedding will return the # vectors associated to each word as a matrix. Each column of the Matrix # returned corresponds to the vectors associated to that word. Because of this, # you can access corresponding vectors like: # > apple <- w2v_v(list("apple", "grape") |> as.characte(), emb)[,"apple"] # > grape <- w2v_v(list("apple", "grape") |> as.characte(), emb)[,"grape"] w2v_v <- function(words, emb) Vectorize(function(str) w2v(str, emb))(words) # sim : (String, String, Embedding) -> Matrix # sim : (String, String, Embedding, String) -> Matrix # sim : ([String], [String], Embedding) -> Matrix # sim : ([String], [String], Embedding, String) -> Matrix # This function, given two strings or lists of strings and an embedding, will # return the semantic similarity between those two words or list of words as a # Floating-point Matrix where each entry is between -1.0 and 1.0. This semantic # similarity is just the cosine of the angle between the two vectors. sim <- function(w1, w2, emb, type = "cosine") word2vec_similarity(w2v(w1, emb), w2v(w2, emb), type = type) # When Python writes a pandas.DataFrame into a CSV, it will run each cell in the # DataFrame through repr and write that into the CSV. This means that when R # reads the csv, R will interpret each cell in the CSV as a string, instead of # the correct type that is supposed to actually be there. py2r <- function(df, rows) { for (row in rows) { # In this case we use reticulate::py_eval to convert a python string # into the R representation. df[[row]] <- lapply(df[[row]], reticulate::py_eval) } return(df) } # rep : (String, Integer) -> [String] # This function, given a string and an integer, will return a list containing # the string repeated however many times the integer says it should. rep <- function(str, count) { out <- character(length = count) for (i in 1:count) { out[i] <- str } out } # These are the tags that are used to categorize the data. lgbt_tags <- c("lgbt","lgbtq","sex","identity","gender","orientation","nonbinary") |> as.character() race_ethnicity_tags <- c("race","ethnicity","african","american","black","hispanic","asian","indigenous","native","latino","latina","latine") |> as.character() women_tags <- c("woman","women","girl","feminine","femeninity","ms","mrs") |> as.character() men_tags <- c("man", "men", "boy", "male", "masculine", "masculinity", "mr") |> as.character() disabilities_tags <- c("disabilities","disabled","disability","handicap","handicapped","neurodivergent") |> as.character() # This variable holds all of the tags. Additionally, tag_categories holds all # the tags together with their categories. tags <- c(lgbt_tags, race_ethnicity_tags, women_tags, men_tags, disabilities_tags) tag_categories <- c( rep("lgbt", length(lgbt_tags)), rep("race/ethnicity", length(race_ethnicity_tags)), rep("women", length(women_tags)), rep("men", length(men_tags)), rep("disabilities", length(disabilities_tags)) ) # similarity_values <- numeric(length = length(tags)) word_category <- function(word, tag_vectors, emb, threshold = 0.3) { for (i in seq_along(tags)) { similarity_values[i] <- word2vec_similarity(w2v(word, emb), tag_vectors[,i], type = "cosine") } similarities <- data.frame( sim = similarity_values, tag = tags, tag_category = tag_categories ) similarities <- similarities[similarities |> complete.cases(),] m <- max(similarities $ sim, na.rm = TRUE) if (m > threshold) similarities[m == similarities,] |> head(1) else data.frame(sim = NA, tag = NA, tag_category = NA) } memo <- new.env(hash = TRUE, parent = emptyenv()) word_category_m <- function(x, tag_vectors, emb, threshold = 0.3) { if (is.null(memo[[x]])) { memo[[x]] <- word_category(x, tag_vectors, emb, threshold) } return(memo[[x]]) } word_category_v <- function(words, tag_vectors, emb, threshold = 0.3) { res <- Vectorize(function(word) word_category_m(word, tag_vectors, emb))(words) |> t() sim <- numeric(length = length(words)) tags <- character(length = length(words)) tag_categories <- character(length = length(words)) for (i in 1:nrow(res)) { sim[i] <- res[,"sim"][[i]] tags[i] <- res[,"tag"][[i]] tag_categories[i] <- res[,"tag_category"][[i]] } data.frame( sim = sim, tag = tags, tag_category = tag_categories ) } # Read data from CSV and convert it into its R representation data <- read.csv("data/variables.csv") |> py2r(rows = 4:9) ams_selection <- data[,"title"] == "American Mathematical Society" data[ams_selection,"title"] <- "AMS" cbms_selection <- data[,"title"] == "Conference Board of the Mathematical Sciences 2021 Survey" data[cbms_selection,"title"] <- "CBMS" ipeds_selection <- data[,"title"] == "Integrated Postsecondary Education Data System (IPEDS) Institution Lookup" data[ipeds_selection,"title"] <- "IPEDS" selection <- ams_selection | cbms_selection | ipeds_selection data <- data[selection,] clean_text <- function(raw_text) { tmp <- tibble( line = seq_along(raw_text), text = raw_text ) |> unnest_tokens(word, text) tmp[!grepl("\\d", tmp $ word),] |> anti_join(stop_words) } # Word stems analysis for (row_idx in 1:nrow(data)) { title <- data[row_idx,"title"] stem_txt <- clean_text(data[row_idx,"variables"][[1]]) |> mutate(word_stem = wordStem(word)) stem_count <- stem_txt |> inner_join(count(stem_txt, word_stem)) |> filter(n > 5) |> distinct(word_stem, .keep_all = TRUE) save(stem_count, file = paste("data/stem_", title, ".Rda", sep = "")) } # Word semantics analysis word_semantic_analysis <- function(emb, data, model_name, threshold = 0.3) { tag_vectors <- w2v_v(tags, emb) for (row_idx in 1:nrow(data)) { title <- data[row_idx,"title"] clean_txt <- clean_text(data[row_idx, "variables"][[1]]) memo <- new.env(hash = TRUE, parent = emptyenv()) word_category_m <- function(x, tag_vectors, emb, threshold = 0.3) { if (is.null(memo[[x]])) { memo[[x]] <- word_category(x, tag_vectors, emb, threshold) } return(memo[[x]]) } word_category_v <- function(words, tag_vectors, emb, threshold = 0.3) { res <- Vectorize(function(word) word_category_m(word, tag_vectors, emb, threshold))(words) |> t() sim <- numeric(length = length(words)) tags <- character(length = length(words)) tag_categories <- character(length = length(words)) for (i in 1:nrow(res)) { sim[i] <- res[,"sim"][[i]] tags[i] <- res[,"tag"][[i]] tag_categories[i] <- res[,"tag_category"][[i]] } data.frame( sim = sim, tag = tags, tag_category = tag_categories ) } word_categories <- word_category_v(clean_txt $ word, tag_vectors, emb, threshold) sema_txt <- clean_txt |> mutate(tag = word_categories $ tag, word_category = word_categories $ tag_category) sema_count <- sema_txt |> inner_join(count(sema_txt, tag)) |> distinct(tag, .keep_all = TRUE) save(sema_count, file = paste("data/", model_name, "_", threshold, "_", title, ".Rda", sep = "")) } } # emb (short for embedding) is a matrix with 3,000,000 rows and 300 columns. # Each row represents a point in 300-dimensional space. Since this is a # two-dimensional matrix, you can access a specific coordinate using: # > emb["your_word_here",number_of_coordinate_here] # However, what we want to do is associate an english word with a point in 300 # dimensions. The way in which we will be using emb is: # > grape <- emb["apple",] # > apple <- emb["grape",] # After this, we can calculate how "close" grape and apple are semantically: # > word2vec_similarity(grape, apple, type = "cosine") # This returns a number between 0.0 and 1.0, where 0.0 represents completely # different words and 1.0 represents the same word. # For more information on how this works check out: # - https://code.google.com/archive/p/word2vec/ read.wordvectors("google_vecs.bin", type = "bin") |> word_semantic_analysis(data = data, model_name = "google_news", threshold = 0.185) read.wordvectors("glove.6B.300d.txt", type = "txt") |> word_semantic_analysis(data = data, model_name = "glove_300d", threshold = 0.134) read.wordvectors("glove.6B.200d.txt", type = "txt") |> word_semantic_analysis(data = data, model_name = "glove_200d", threshold = 0.164) read.wordvectors("glove.6B.100d.txt", type = "txt") |> word_semantic_analysis(data = data, model_name = "glove_100d", threshold = 0.219) read.wordvectors("glove.6B.50d.txt", type = "txt") |> word_semantic_analysis(data = data, model_name = "glove_50d", threshold = 0.273)