# Anime Metadata DB - data transformations
# (repository view: at master, 2.9 kB)
"""Cross-reference anime records between three local JSON dumps.

The script reads ``arm.json``, ``kitsu.json`` and ``shoboiTitles.json`` and
links records in two passes:

1. by ID: ARM rows are joined to Kitsu on ``kitsu_id`` and then to Shoboi on
   ``syobocal_tid`` == ``tid``;
2. by title: every Kitsu title variant is fuzzy-compared with every Shoboi
   title, keeping pairs whose similarity ratio exceeds a threshold.

Both result sets are concatenated, de-duplicated on ``(kitsu_id, tid)`` with
ID-based rows taking precedence, printed, and exported to CSV.
"""
import json
from difflib import SequenceMatcher

import pandas as pd

# Minimum SequenceMatcher ratio (exclusive) for two titles to count as a match.
SIMILARITY_THRESHOLD = 0.85

# Title variants looked up in a Kitsu ``title`` dict, in order of preference.
TITLE_KEYS = ("romaji", "english", "native")

ID_CROSS_COLUMNS = [
    "kitsu_id",
    "kitsu_title",
    "syobocal_tid",
    "tid",
    "shoboi_title",
    "mal_id",
    "anilist_id",
]
TITLE_CROSS_COLUMNS = ["kitsu_id", "kitsu_title", "tid", "shoboi_title", "similitud"]
RESULT_COLUMNS = ID_CROSS_COLUMNS + ["similitud"]


def load_json(path):
    """Return the parsed content of the UTF-8 JSON file at *path*."""
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)


def normalize_id(value):
    """Return *value* as a stripped string ID, or ``None`` when it is missing.

    Integral floats are rendered without the trailing ``.0``: a numeric ID
    column that contains gaps is promoted to float by pandas, and ``538.0``
    must still compare equal to the ID ``538`` on the other side of a join.
    """
    if value is None or value is pd.NA:
        return None
    if isinstance(value, float):
        if value != value:  # NaN is the only float that differs from itself
            return None
        if value.is_integer():
            return str(int(value))
    text = str(value).strip()
    return text or None


def normalize_ids(series):
    """Apply :func:`normalize_id` to every element of *series*.

    The result is forced to ``object`` dtype so that every join key has the
    same dtype on both sides of a merge, even when a column is entirely null.
    """
    return series.map(normalize_id).astype(object)


def extract_kitsu_title(row):
    """Return the preferred display title of a Kitsu record, or ``None``.

    *row* is anything with a dict-like ``get`` (a ``pandas.Series`` row or a
    plain dict).  The title is read from ``title_kitsu`` or, failing that,
    ``title``; it is only used when it is a dict, in which case the first
    non-empty value among romaji, english and native is returned.
    """
    t = row.get("title_kitsu") or row.get("title")
    if isinstance(t, dict):
        return t.get("romaji") or t.get("english") or t.get("native")
    return None


def similarity(a, b):
    """Return the ``SequenceMatcher`` similarity ratio of *a* and *b* (0..1)."""
    return SequenceMatcher(None, a, b).ratio()


def _ratio_if_above(a, b, threshold):
    """Return ``similarity(a, b)`` if it can exceed *threshold*, else ``0.0``.

    ``real_quick_ratio()`` and ``quick_ratio()`` are documented upper bounds
    on ``ratio()``, so bailing out on them never drops a real match; it only
    skips the expensive full comparison for hopeless pairs.
    """
    matcher = SequenceMatcher(None, a, b)
    if matcher.real_quick_ratio() <= threshold:
        return 0.0
    if matcher.quick_ratio() <= threshold:
        return 0.0
    return matcher.ratio()


def build_id_cross(arm_df, kitsu_df, shoboi_df):
    """Join the three frames on their (already normalized) ID columns.

    Only the columns that end up in the result are carried into the merges.
    This keeps the Kitsu ``title`` (a dict) and the Shoboi ``title`` (a
    string) from colliding on the same column name, which would otherwise
    label the Kitsu title as ``shoboi_title`` and make ``drop_duplicates``
    fail on unhashable dict values.

    Returns a frame with the columns listed in ``ID_CROSS_COLUMNS``.
    """
    arm_side = arm_df[["kitsu_id", "syobocal_tid", "mal_id", "anilist_id"]]
    # Null keys match each other in a pandas merge; drop them up front.
    arm_side = arm_side[arm_side["kitsu_id"].notna()]

    kitsu_side = kitsu_df.reindex(columns=["kitsu_id", "title"])
    kitsu_side["kitsu_title"] = [
        extract_kitsu_title(record) for record in kitsu_side.to_dict("records")
    ]
    kitsu_side = kitsu_side.drop(columns="title")

    shoboi_side = shoboi_df.reindex(columns=["tid", "title"]).rename(
        columns={"title": "shoboi_title"}
    )
    shoboi_side = shoboi_side[shoboi_side["tid"].notna()]

    merged = arm_side.merge(kitsu_side, on="kitsu_id", how="inner")
    merged = merged.merge(
        shoboi_side, left_on="syobocal_tid", right_on="tid", how="left"
    )
    return merged[ID_CROSS_COLUMNS].drop_duplicates().reset_index(drop=True)


def build_title_cross(kitsu_df, shoboi_df, threshold=SIMILARITY_THRESHOLD):
    """Fuzzy-match Kitsu title variants against Shoboi titles.

    Titles are compared case-insensitively.  For each (Kitsu record, Shoboi
    record) pair the best ratio over all Kitsu title variants is used, and
    the pair is reported once when that ratio is strictly greater than
    *threshold*.  Records whose title is missing or of an unexpected type
    are skipped instead of raising.

    Returns a frame with the columns listed in ``TITLE_CROSS_COLUMNS``.
    """
    shoboi_side = shoboi_df.reindex(columns=["tid", "title"])
    # Lower-case every Shoboi title once instead of once per comparison.
    candidates = [
        (tid, title, title.lower())
        for tid, title in zip(shoboi_side["tid"], shoboi_side["title"])
        if isinstance(title, str) and title
    ]

    matches = []
    kitsu_side = kitsu_df.reindex(columns=["kitsu_id", "title"])
    for record in kitsu_side.to_dict("records"):
        titles = record["title"]
        if not isinstance(titles, dict):
            continue
        raw_variants = (titles.get(key) for key in TITLE_KEYS)
        # dict.fromkeys de-duplicates identical variants, preserving order.
        variants = list(
            dict.fromkeys(
                v.lower() for v in raw_variants if isinstance(v, str) and v
            )
        )
        if not variants:
            continue
        display_title = extract_kitsu_title(record)
        for tid, shoboi_title, shoboi_lower in candidates:
            best = max(
                _ratio_if_above(variant, shoboi_lower, threshold)
                for variant in variants
            )
            if best > threshold:
                matches.append({
                    "kitsu_id": record["kitsu_id"],
                    "kitsu_title": display_title,
                    "tid": tid,
                    "shoboi_title": shoboi_title,
                    "similitud": round(best, 2),
                })
    return pd.DataFrame(matches, columns=TITLE_CROSS_COLUMNS)


def cross_reference(arm, kitsu, shoboi, threshold=SIMILARITY_THRESHOLD):
    """Build the combined cross-reference table from the three raw datasets.

    *arm*, *kitsu* and *shoboi* are the parsed JSON payloads (anything
    ``pandas.DataFrame`` accepts, e.g. a list of dicts).  The result has the
    columns in ``RESULT_COLUMNS`` and at most one row per ``(kitsu_id, tid)``
    pair; when a pair is found both by ID and by title, the ID-based row is
    kept.
    """
    arm_df = pd.DataFrame(arm)
    kitsu_df = pd.DataFrame(kitsu)
    shoboi_df = pd.DataFrame(shoboi)

    # Every join key is normalized, including syobocal_tid, so both sides of
    # each merge compare string IDs.
    arm_df["kitsu_id"] = normalize_ids(arm_df["kitsu_id"])
    arm_df["syobocal_tid"] = normalize_ids(arm_df["syobocal_tid"])
    kitsu_df["kitsu_id"] = normalize_ids(kitsu_df["kitsu_id"])
    shoboi_df["tid"] = normalize_ids(shoboi_df["tid"])

    id_cross = build_id_cross(arm_df, kitsu_df, shoboi_df)
    title_cross = build_title_cross(kitsu_df, shoboi_df, threshold)

    # Empty frames are left out of the concat so they cannot influence the
    # resulting column dtypes.
    frames = [frame for frame in (id_cross, title_cross) if not frame.empty]
    if not frames:
        return pd.DataFrame(columns=RESULT_COLUMNS)
    combined = pd.concat(frames, ignore_index=True)
    combined = combined.drop_duplicates(subset=["kitsu_id", "tid"], keep="first")
    return combined.reindex(columns=RESULT_COLUMNS)


def main():
    """Load the local JSON files, print the summary and export it to CSV."""
    combined = cross_reference(
        load_json("../data/arm.json"),
        load_json("../data/kitsu.json"),
        load_json("../data/shoboiTitles.json"),
    )

    # Show the summary.
    print("=== Cruce resumido entre los tres JSON ===")
    print(combined.to_string(index=False))
    print(f"\nTotal de coincidencias encontradas: {len(combined)}")

    # Export to CSV (BOM-prefixed UTF-8).
    combined.to_csv("crossref_result.csv", index=False, encoding="utf-8-sig")


if __name__ == "__main__":
    main()