this repo has no description
at main 244 lines 10 kB view raw
# %%
import gc
import re
import csv
import io
import os
from functools import reduce  # noqa: F401  kept: part of the file's original import surface

import pandas as pd

# Tables expected in the "public" schema of the dump.
public_tables = [
    "account",
    "archive_upload",
    "followers",
    "liked_tweets",
    "likes",
    "mentioned_users",
    "profile",
    "tweets",
    "tweet_media",
    "tweet_urls",
    "user_mentions",
]

# Terminator of a multi-row INSERT statement: ");" followed by a newline.
insertion_separator = ");\n"

# Separator between consecutive rows of a multi-row INSERT:
# "...),\n\t(..." — close paren, comma, newline, tab, open paren.
# NOTE: the original file re-assigned this constant three times; only this
# final value was ever effective.
row_separator = "),\n\t("

# Marker that another INSERT statement begins.
start_of_insert = 'INSERT INTO "public".'
# Marker that a new table's data section begins in the dump.
start_of_table = "\n--\n-- Data for Name:"


def parse_sql_values(line):
    """Parse one SQL tuple literal like ``(1, 'it''s', NULL)`` into a list
    of string values.

    Single-quoted strings may contain commas and parentheses; SQL escapes a
    quote inside a string by doubling it (``''``), which is unescaped here.
    Unquoted tokens (numbers, NULL) are returned verbatim.

    Args:
        line: the row text, wrapped in a single pair of parentheses.

    Returns:
        list[str]: the parsed values in order.
    """

    def finalize(chars):
        # Trim whitespace, drop the surrounding quotes of a string literal,
        # then unescape doubled single quotes.
        value = "".join(chars).strip()
        if len(value) >= 2 and value[0] == "'" and value[-1] == "'":
            value = value[1:-1]
        return value.replace("''", "'")

    values = []
    current = []
    in_quotes = False
    depth = 0
    for char in line.strip():
        if in_quotes:
            # Inside a string literal everything is data. A quote toggles us
            # out; a doubled quote toggles straight back in on the next char,
            # and finalize() collapses the pair to a single quote.
            current.append(char)
            if char == "'":
                in_quotes = False
        elif char == "'":
            current.append(char)
            in_quotes = True
        elif char == "(":
            # Unquoted parens only delimit the tuple itself; not kept as data.
            depth += 1
        elif char == ")":
            if depth == 1:
                values.append(finalize(current))
                current = []
            depth = max(depth - 1, 0)
        elif char == "," and depth == 1:
            values.append(finalize(current))
            current = []
        else:
            current.append(char)
    return values


def _strip_row_delimiters(chunk):
    """Strip the artifacts that splitting on ``row_separator`` leaves behind:
    the first chunk keeps a leading "(" and the last keeps a trailing ");".
    After this every chunk is bare comma-separated values."""
    chunk = chunk.strip()
    if chunk.endswith(";"):
        chunk = chunk[:-1].rstrip()
    if chunk.startswith("("):
        chunk = chunk[1:]
    if chunk.endswith(")"):
        chunk = chunk[:-1]
    return chunk


def process_dump(input_file, output_dir):
    """Convert a PostgreSQL plain-text dump into one CSV file per table.

    Scans *input_file* for multi-row ``INSERT INTO "public"."<table>"
    (...) VALUES`` statements, parses every row, and writes
    ``<output_dir>/<table>.csv`` using the column names from the first
    INSERT seen for each table.

    Args:
        input_file: path to the SQL dump (read fully into memory).
        output_dir: directory for the CSVs; created if missing.
    """
    table_cols = {}  # table name -> flat list of column names (first INSERT wins)
    table_rows = {}  # table name -> accumulated parsed rows

    # Matches the INSERT header; group 1 = table name, group 2 = column list.
    # \s+ tolerates either a space or a line break before VALUES.
    insert_regex = re.compile(
        r'INSERT INTO "public"\."(\w+)" \((.*?)\)'
        r"(?: OVERRIDING SYSTEM VALUE)?\s+VALUES"
    )

    with open(input_file, "r", encoding="utf-8") as f:
        content = f.read()

    for match in insert_regex.finditer(content):
        current_table = match.group(1)
        columns = [col.strip('"') for col in match.group(2).split(", ")]
        if current_table not in table_cols:
            # Store the flat column list — wrapping it in another list would
            # hand pandas a MultiIndex header later on.
            table_cols[current_table] = columns
            print(f"table: {current_table} columns: {columns}")

        # The statement's rows run from the end of the header up to the next
        # INSERT or the next table section, whichever comes first. str.find
        # returns -1 for "not found", so filter those out before min() —
        # min() with a raw -1 would wrongly swallow the rest of the file.
        start = match.end()
        boundaries = [
            pos
            for pos in (
                content.find(start_of_insert, start),
                content.find(start_of_table, start),
            )
            if pos != -1
        ]
        end = min(boundaries) if boundaries else len(content)
        print(f"insert from {start} to {end}, length: {end-start}")

        print("Breaking insert into row strings...")
        insert_txt = content[start:end]
        # Normalize each chunk before re-wrapping so the first/last rows of
        # the statement (which carry their own "("/");" remnants) parse just
        # like the middle ones instead of being rejected.
        row_strs = [
            f"({_strip_row_delimiters(chunk)})"
            for chunk in insert_txt.split(row_separator)
        ]
        print(f"Found {len(row_strs)} rows")

        print("Parsing SQL rows...")
        if current_table not in table_rows:
            table_rows[current_table] = []

        current_rows = []
        for i, row_str in enumerate(row_strs, 1):
            parsed_values = parse_sql_values(row_str)
            if len(parsed_values) == len(columns):
                current_rows.append(parsed_values)
            else:
                print(
                    f"Error: Row {i} in table {current_table} has "
                    f"{len(parsed_values)} values, expected {len(columns)}"
                )
            if i % 10000 == 0:
                print(f"Parsed {i} rows")
        table_rows[current_table].extend(current_rows)
        del current_rows
        gc.collect()

        print(
            f"Table {current_table} now has {len(table_rows[current_table])} rows"
        )

    # Write one CSV per table. makedirs creates missing parents as well.
    os.makedirs(output_dir, exist_ok=True)
    for table in table_rows:
        df = pd.DataFrame(table_rows[table], columns=table_cols[table])
        df.to_csv(f"{output_dir}/{table}.csv", index=False)
        del df
        gc.collect()


# Default paths, resolved relative to this file's location in the repo.
PROJECT_ROOT = os.path.join(os.path.dirname(__file__), "../..")
input_file = f"{PROJECT_ROOT}/supabase/schema.sql"
output_dir = f"{PROJECT_ROOT}/dumps/big1"
# %%
# Usage:
# process_dump(input_file, output_dir)
# %%
# Sample data below is scratch material for interactive (# %%) testing of the
# parser; it is not read by the pipeline.
insertions_str = """
--
-- Data for Name: likes; Type: TABLE DATA; Schema: public; Owner: postgres
--

INSERT INTO "public"."likes" ("id", "account_id", "liked_tweet_id", "archive_upload_id") OVERRIDING SYSTEM VALUE VALUES
	(747024, '1796120648281923584', '1826310911050629401', 17),
	(747025, '1796120648281923584', '1826257777892983255', 17),
	(747026, '1796120648281923584', '1826204870694916186', 17),
	(747027, '1796120648281923584', '1826296964881027483', 17),
	(747028, '1796120648281923584', '1825774665144340827', 17),
	(747029, '1796120648281923584', '1826024905227244030', 17),
	(747030, '1796120648281923584', '1826139046483013948', 17),
	(747031, '1796120648281923584', '1826078098481361307', 17),
	(747032, '1796120648281923584', '1826282730193318175', 17),
	(747033, '1796120648281923584', '1826271289017028776', 17),
	(747034, '1796120648281923584', '1826266593950564355', 17),
	(747035, '1796120648281923584', '1826265722634502301', 17),
	(747036, '1796120648281923584', '1826264433578148064', 17),
	(747037, '1796120648281923584', '1825928781367791868', 17),
	(747038, '1796120648281923584', '1826257231786242482', 17);



--
-- Data for Name: mentioned_users; Type: TABLE DATA; Schema: public; Owner: postgres
--

INSERT INTO "public"."mentioned_users" ("user_id", "name", "screen_name", "updated_at") VALUES
	('29961293', 'Geger Riyanto', 'gegerriy', '2024-09-05 18:06:03.635+00'),
	('15633111', 'Mason Currey', 'masoncurrey', '2024-09-04 15:25:09.757+00'),
	('44833706', 'synaptic stimuli', 'lindaraharja', '2024-09-05 18:06:03.634+00'),
	('10545', 'Mike Rundle', 'flyosity', '2024-09-03 11:52:44.385+00'),
	('21195588', 'Tim Pastoor', 'timpastoor', '2024-09-04 15:25:09.757+00'),
	('46991136', 'Gauchienne', 'gaucheian', '2024-09-05 18:06:03.637+00'),
	('24490384', 'Damian', 'BEEPLEofWALMART', '2024-09-04 15:25:09.757+00'),
	('182383', 'Ben Gold', 'bengold', '2024-09-06 18:29:53.843+00'),
	('53044984', 'Harbowoputra', 'harbowoputra', '2024-09-05 18:06:03.633+00'),
	('57816604', 'Maybe: Hegar', 'HPEgieara', '2024-09-05 18:06:03.636+00'),
	('74731503', 'Liam 🔻', 'cluelessdirectr', '2024-09-05 18:06:03.635+00'),
	('93421683', 'Yihui is returning to self', 'empirepowder', '2024-09-05 18:06:03.635+00'),
	('1106554797879119872', 'Victor', 'notnaughtknot', '2024-07-14 20:59:36+00'),
	('70894158', 'Ryan Abel', 'GeneralAntilles', '2024-09-04 15:25:09.758+00'),
	('110451384', 'Matt S "unpredictably hypergolic" Trout (mst)', 'shadowcat_mst', '2024-09-04 15:25:09.756+00'),
	('122484263', 'Urban Composition', 'urban_comp', '2024-09-04 15:25:09.758+00'),
	('100686498', 'Winda', 'windaul', '2024-09-05 18:06:03.634+00');


INSERT INTO "public"."mentioned_users" ("user_id", "name", "screen_name", "updated_at") VALUES
	('29961293', 'Geger Riyanto', 'gegerriy', '2024-09-05 18:06:03.635+00'),
	('15633111', 'Mason Currey', 'masoncurrey', '2024-09-04 15:25:09.757+00'),
	('44833706', 'synaptic stimuli', 'lindaraharja', '2024-09-05 18:06:03.634+00'),
	('10545', 'Mike Rundle', 'flyosity', '2024-09-03 11:52:44.385+00'),
	('21195588', 'Tim Pastoor', 'timpastoor', '2024-09-04 15:25:09.757+00'),
	('46991136', 'Gauchienne', 'gaucheian', '2024-09-05 18:06:03.637+00'),
	('24490384', 'Damian', 'BEEPLEofWALMART', '2024-09-04 15:25:09.757+00'),
	('182383', 'Ben Gold', 'bengold', '2024-09-06 18:29:53.843+00'),
	('53044984', 'Harbowoputra', 'harbowoputra', '2024-09-05 18:06:03.633+00'),
	('57816604', 'Maybe: Hegar', 'HPEgieara', '2024-09-05 18:06:03.636+00'),
	('74731503', 'Liam 🔻', 'cluelessdirectr', '2024-09-05 18:06:03.635+00'),
	('93421683', 'Yihui is returning to self', 'empirepowder', '2024-09-05 18:06:03.635+00'),
	('1106554797879119872', 'Victor', 'notnaughtknot', '2024-07-14 20:59:36+00'),
	('70894158', 'Ryan Abel', 'GeneralAntilles', '2024-09-04 15:25:09.758+00'),
	('110451384', 'Matt S "unpredictably hypergolic" Trout (mst)', 'shadowcat_mst', '2024-09-04 15:25:09.756+00'),
	('122484263', 'Urban Composition', 'urban_comp', '2024-09-04 15:25:09.758+00'),
	('100686498', 'Winda', 'windaul', '2024-09-05 18:06:03.634+00');
"""

rows_str = """('1823552071452242220', 'The plan isn''t that important in the end. But the plan allows you to take action & the action will generate better plans. This feedback loop is the important part.

You just have to start somewhere & dive in. https://t.co/I2nnM2rIHT'),
	('1823671598911463631', '@_samand_ @So8res curiously this is a great relationship advice'),
	('1823552064363938254', 'there is a blog post called by @So8res that helped me develop more agency & take imperfect action toward a goal

the main takeaway is if you''re ambitious and want to do something important, you have to get your hands dirty and try things https://t.co/YzgNt4sQuM'),
	('1823552068868604085', 'Alice''s plan is bad. But it''s better than Bob''s.

Why? Because Alice will be in the arena _trying things_.

Alice will be out there "bumping into the world" — I love this phrase. I think about it all the time. I want to bump into things!!! https://t.co/GeFbBrrv8d'),
	('1823478439875060100', 'the problem is not that insight is hard to get— the problem is it’s fucking scary'),
	('1823626245348688380', 'this portal retrospective ain''t gonna write itself folks'),
	('1823632275453469080', 'omg i need to leave my job NOW i''m suffering'),
	('1823655030404116482', '@nido_kween @exgenesis 😂😂😂'),
	('1823654543067865491', '@TheJointleman @exgenesis same');"""

if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Process SQL dump")
    parser.add_argument("input_file", help="Path to the input SQL dump file")
    parser.add_argument("output_file", help="Path to the output processed file")
    args = parser.parse_args()

    process_dump(args.input_file, args.output_file)