# Anime Metadata DB data transformations
import json
from difflib import SequenceMatcher

import pandas as pd

# === Load the local JSON files ===
# The paths are relative, so they resolve against the current working directory.
with open("../data/arm.json", "r", encoding="utf-8") as f:
    arm = json.load(f)
with open("../data/kitsu.json", "r", encoding="utf-8") as f:
    kitsu = json.load(f)
with open("../data/shoboiTitles.json", "r", encoding="utf-8") as f:
    shoboi = json.load(f)

# === Convert to DataFrames ===
arm_df = pd.DataFrame(arm)
kitsu_df = pd.DataFrame(kitsu)
shoboi_df = pd.DataFrame(shoboi)

# === Normalize ID types ===
def _normalize_id(value):
    """Return a scalar ID as a stripped string, or ``None`` when it is missing.

    Integral floats are rendered without the trailing ``.0`` so that an ID
    loaded as ``123.0`` still equals the string ``"123"`` on the other side
    of a merge.
    """
    if pd.isna(value):
        return None
    if isinstance(value, float) and value.is_integer():
        return str(int(value))
    return str(value).strip()


arm_df["kitsu_id"] = arm_df["kitsu_id"].map(_normalize_id)
# syobocal_tid is the left key of the second merge below, so it needs the same
# string form as shoboi_df["tid"]; left as raw JSON values it would not compare
# equal to the string tids (pandas rejects merging numeric and string keys).
arm_df["syobocal_tid"] = arm_df["syobocal_tid"].map(_normalize_id)
kitsu_df["kitsu_id"] = kitsu_df["kitsu_id"].map(_normalize_id)
shoboi_df["tid"] = shoboi_df["tid"].map(_normalize_id)

# === CROSS-REFERENCE BY ID ===
# pandas matches null merge keys against each other, so rows without a key are
# dropped from the right-hand side to avoid pairing "missing" with "missing".
merged = arm_df.merge(
    kitsu_df.dropna(subset=["kitsu_id"]),
    on="kitsu_id",
    how="inner",
    suffixes=("_arm", "_kitsu"),
)
merged = merged.merge(
    shoboi_df.dropna(subset=["tid"]),
    left_on="syobocal_tid",
    right_on="tid",
    how="left",
    suffixes=("", "_shoboi"),
)

# Extract the Kitsu title, if there is one
def extract_kitsu_title(row):
    """Return the preferred Kitsu title stored in *row*.

    The title dict is read from ``row["title_kitsu"]``, falling back to
    ``row["title"]`` when the former is absent or falsy (which column exists
    depends on how the merge suffixed the columns).

    Args:
        row: Any object with a mapping-style ``get`` method, such as a
            DataFrame row or a plain dict.

    Returns:
        The first truthy value among the ``romaji``, ``english`` and
        ``native`` entries; the (falsy) ``native`` value if none is truthy;
        ``None`` when the title is not a dict.
    """
    title = row.get("title_kitsu") or row.get("title")
    if not isinstance(title, dict):
        return None
    return title.get("romaji") or title.get("english") or title.get("native")

merged["kitsu_title"] = merged.apply(extract_kitsu_title, axis=1)

# The Syoboi merge used suffixes ("", "_shoboi"): when the left frame already
# had a plain "title" column, the Syoboi title was renamed to "title_shoboi"
# and "title" holds the other source's value. Pick whichever column really
# carries the Syoboi title.
shoboi_title_col = "title_shoboi" if "title_shoboi" in merged.columns else "title"

id_cross = (
    merged[
        [
            "kitsu_id",
            "kitsu_title",
            "syobocal_tid",
            "tid",
            shoboi_title_col,
            "mal_id",
            "anilist_id",
        ]
    ]
    .rename(columns={shoboi_title_col: "shoboi_title"})
    .drop_duplicates()
)

# === CROSS-REFERENCE BY TITLE SIMILARITY ===
def similarity(a: str, b: str) -> float:
    """Return the ``difflib.SequenceMatcher`` ratio of *a* and *b*.

    The result is a float in ``[0, 1]``; identical sequences (including two
    empty ones) score ``1.0``. No case folding is done here.
    """
    return SequenceMatcher(None, a, b).ratio()

# Exclusive lower bound on the similarity ratio for two titles to count as a match
SIMILARITY_THRESHOLD = 0.85

# Lowercase every Syoboi title once instead of once per Kitsu row, skipping
# rows whose title is empty or not text (e.g. NaN for a missing value).
shoboi_titles = []
for _, srow in shoboi_df.iterrows():
    s_title = srow.get("title", "")
    if isinstance(s_title, str) and s_title:
        shoboi_titles.append((srow["tid"], s_title, s_title.lower()))

title_matches = []
for _, krow in kitsu_df.iterrows():
    titles = krow.get("title", {})
    if not isinstance(titles, dict):
        # Nothing to compare when the record has no title dict
        continue
    k_titles = [
        candidate.lower()
        for candidate in (titles.get(key) for key in ("romaji", "english", "native"))
        if isinstance(candidate, str) and candidate
    ]
    if not k_titles:
        continue
    for tid, s_title, s_lower in shoboi_titles:
        # One row per (kitsu_id, tid) pair, scored by its best-matching title variant
        best = max(similarity(kt, s_lower) for kt in k_titles)
        if best > SIMILARITY_THRESHOLD:
            title_matches.append({
                "kitsu_id": krow["kitsu_id"],
                "kitsu_title": titles.get("romaji", ""),
                "tid": tid,
                "shoboi_title": s_title,
                "similitud": round(best, 2),
            })

title_cross = pd.DataFrame(title_matches)

# === COMBINE RESULTS ===
# ID-based rows are concatenated first, so keep="first" prefers them over a
# title-similarity row for the same (kitsu_id, tid) pair.
combined = pd.concat([id_cross, title_cross], ignore_index=True)
combined = combined.drop_duplicates(subset=["kitsu_id", "tid"], keep="first")

# === SHOW SUMMARY ===
print("=== Cruce resumido entre los tres JSON ===")
print(combined.to_string(index=False))
print(f"\nTotal de coincidencias encontradas: {len(combined)}")

# === OPTIONAL: export to CSV ===
# "utf-8-sig" writes a byte-order mark at the start of the file
# (presumably so spreadsheet applications detect the encoding).
combined.to_csv("crossref_result.csv", index=False, encoding="utf-8-sig")