this repo has no description
0
fork

Configure Feed

Select the types of activity you want to include in your feed.

feat: Added and calculated synonym values for models

The file generate_synonyms_data.R generates R data files inside the data/
directory; each file holds a dataframe called "data". These dataframes
hold data that can be used to calculate the average similarity of
synonyms, using the synonym pairs inside syn.csv.

Signed-off-by: Sona Tau Estrada Rivera <sona@stau.space>

+97
code/data/glove_100d_syn.Rda

This is a binary file and will not be displayed.

code/data/glove_200d_syn.Rda

This is a binary file and will not be displayed.

code/data/glove_300d_syn.Rda

This is a binary file and will not be displayed.

code/data/glove_50d_syn.Rda

This is a binary file and will not be displayed.

code/data/google_vecs_syn.Rda

This is a binary file and will not be displayed.

+6
code/data/synonym_thresholds.csv
··· 1 + model,min,q1,median,mean,q3,max 2 + google news,-0.169,0.099,0.185,0.216,0.303,1.000 3 + glove 6B 300D,-0.375,0.035,0.134,0.154,0.251,1.000 4 + glove 6B 200D,-0.410,0.050,0.164,0.182,0.297,1.000 5 + glove 6B 100D,-0.493,0.072,0.219,0.227,0.374,1.000 6 + glove 6B 50D,-0.602,0.094,0.273,0.270,0.449,1.000
+91
code/generate_synonyms_data.R
# Generate synonym-similarity data files ----
#
# For each pre-trained embedding model listed in `models`, compute the cosine
# similarity between every (word, synonym) pair in syn.csv and save the result
# as an object named `data` in an .Rda file under data/.

# install.packages("tidyverse")
library(tibble) # tibble (comes from tidyverse)
# install.packages("word2vec")
library(word2vec) # read.wordvectors, txt_clean_word2vec, word2vec_similarity
# install.packages("reticulate")
library(reticulate) # reticulate::py_eval
# install.packages("tidytext")
library(tidytext) # data("stop_words")
# install.packages("dplyr")
library(dplyr) # anti_join
# install.packages("SnowballC")
library(SnowballC) # wordStem
# install.packages("ggplot2")
library(ggplot2) # ggplot

# Word pairs to score; the `word` and `synonym` columns are used below.
syn <- read.csv("syn.csv")

# Models to process ----
# One row per embedding file: where to read it from, which reader type to pass
# to read.wordvectors(), and where to save the similarities.
models <- data.frame(
  input = c(
    "google_vecs.bin",
    "glove.6B.300d.txt",
    "glove.6B.200d.txt",
    "glove.6B.100d.txt",
    "glove.6B.50d.txt"
  ),
  type = c("bin", "txt", "txt", "txt", "txt"),
  output = c(
    "data/google_vecs_syn.Rda",
    "data/glove_300d_syn.Rda",
    "data/glove_200d_syn.Rda",
    "data/glove_100d_syn.Rda",
    "data/glove_50d_syn.Rda"
  ),
  stringsAsFactors = FALSE
)

#' Cosine similarity of each (word, synonym) pair under one embedding
#'
#' @param emb Embedding matrix as returned by `read.wordvectors()`; its
#'   rownames are used as the vocabulary.
#' @param words Words to look up, one per pair.
#' @param synonyms Synonyms to look up, parallel to `words`.
#' @return The simplified `mapply()` result of calling
#'   `word2vec_similarity(..., type = "cosine")` once per pair.
synonym_similarities <- function(emb, words, synonyms) {
  vocabulary <- rownames(emb) # hoisted: invariant across all lookups

  # Embedding of a single word after cleaning it with txt_clean_word2vec().
  # Words that are not in the vocabulary map to an all-zero vector.
  # NOTE(review): cosine similarity against a zero vector presumably comes
  # out as NaN (0 / 0), so consumers need na.rm = TRUE - confirm.
  w2v <- function(x) {
    y <- txt_clean_word2vec(x)
    # %in% never returns NA, so a missing word cannot make `if` fail.
    if (any(y %in% vocabulary)) emb[y, ] else numeric(ncol(emb))
  }

  sim <- function(x, y) word2vec_similarity(w2v(x), w2v(y), type = "cosine")

  mapply(sim, words, synonyms)
}

# Generate one data file per model ----

# Create the output directories up front so that save() cannot fail after an
# expensive model load.
for (out_dir in unique(dirname(models$output))) {
  dir.create(out_dir, showWarnings = FALSE, recursive = TRUE)
}

for (i in seq_len(nrow(models))) {
  # Reassigning `emb` on every iteration lets the previous model be collected.
  emb <- read.wordvectors(models$input[i], type = models$type[i])

  # The object must be called `data`: save() stores it under that name and
  # that is the name load() will restore.
  data <- synonym_similarities(emb, syn$word, syn$synonym)
  save(data, file = models$output[i])
}