# Split a PostgreSQL plain-text dump (pg_dump INSERT format) into one CSV file per table.
1# %%
2import gc
3import re
4import csv
5import pandas as pd
6import io
7
# Tables in the "public" schema that the dump is expected to contain.
public_tables = (
    "account archive_upload followers liked_tweets likes mentioned_users "
    "profile tweets tweet_media tweet_urls user_mentions"
).split()
21
22
23insertion_separator = """);
24"""
25
26row_separator = """),
27 ("""
28
29
30from functools import reduce
31
32
def parse_sql_values(line):
    """Parse one "(v1, v2, ...)" tuple from a SQL VALUES list.

    Returns the values as a list of strings: single-quoted SQL strings
    are unquoted and their doubled single quotes ('') collapsed to ',
    while unquoted tokens (numbers, NULL) are returned verbatim.

    Fix over the previous reduce-based scanner: that version toggled
    in/out of quote state without ever keeping the quote characters, so
    an escaped quote ('') was silently dropped ('it''s' came back as
    "its"). Here quote characters stay in the buffer and are removed in
    one place, _unquote(), after the value is complete.
    """

    def _unquote(token):
        # Strip exactly one pair of surrounding quotes (not strip("'"),
        # which would also eat a legitimate trailing escaped quote),
        # then undo the SQL '' escape.
        if len(token) >= 2 and token.startswith("'") and token.endswith("'"):
            token = token[1:-1]
        return token.replace("''", "'")

    values = []
    current = []
    in_quotes = False
    depth = 0  # parenthesis nesting outside of quoted strings

    for char in line.strip():
        if in_quotes:
            if char == "'":
                in_quotes = False
            current.append(char)  # keep quotes; _unquote() removes them
        elif char == "'":
            in_quotes = True
            current.append(char)
        elif char == "(":
            depth += 1
        elif char == ")":
            if depth == 1:
                # End of the tuple: flush the last value.
                values.append("".join(current).strip())
                current = []
                depth = 0
            else:
                depth -= 1
        elif char == "," and depth == 1:
            # Top-level comma separates values.
            values.append("".join(current).strip())
            current = []
        else:
            current.append(char)

    return [_unquote(v) for v in values]
57
58
# Marker that precedes each INSERT statement in the dump.
start_of_insert = 'INSERT INTO "public".'

# Header pg_dump emits before each table's data section.
start_of_table = "\n--\n-- Data for Name:"

# Separator between value tuples inside one INSERT (pg_dump writes each
# row on its own tab-indented line). The original assigned this three
# times with the same value; one assignment suffices.
row_separator = "),\n\t("
67
68
def _normalize_row(chunk):
    """Re-wrap one chunk of a split VALUES list as "(...)".

    Splitting on row_separator leaves a leading "(" on the first chunk
    and a trailing ");" (plus whitespace) on the last one; strip those
    before re-wrapping so every chunk parses identically. The previous
    f"({x})" wrapping left the first row double-parenthesized, so the
    first row of every INSERT failed the column-count check and was
    silently dropped.
    """
    s = chunk.strip()
    if s.startswith("("):
        s = s[1:]
    if s.endswith(";"):
        s = s[:-1].rstrip()
    if s.endswith(")"):
        s = s[:-1]
    return f"({s})"


def process_dump(input_file, output_dir):
    """Split a PostgreSQL plain-text dump into one CSV file per table.

    Scans *input_file* for INSERT INTO "public"."<table>" statements,
    parses every value tuple with parse_sql_values(), and writes
    <output_dir>/<table>.csv with the INSERT's column names as header.
    Rows whose value count does not match the column count are reported
    and skipped.
    """
    import os

    table_cols = {}  # table name -> list of column names
    table_rows = {}  # table name -> accumulated parsed rows

    insert_regex = re.compile(
        r'INSERT INTO "public"\."(\w+)" \((.*?)\)(?: OVERRIDING SYSTEM VALUE)? VALUES'
    )

    with open(input_file, "r", encoding="utf-8") as f:
        content = f.read()

    for match in insert_regex.finditer(content):
        current_table = match.group(1)
        columns = [col.strip('"') for col in match.group(2).split(", ")]
        if current_table not in table_cols:
            # Store the plain column list; the old code wrapped it in an
            # extra list, which pd.DataFrame(columns=...) misinterprets.
            table_cols[current_table] = columns
            print(f"table: {current_table} columns: {columns}")

        # The statement's VALUES run until the next INSERT or the next
        # table-data header, whichever comes first. str.find returns -1
        # for "not found", so a bare min() would wrongly pick -1 when
        # only one marker remains; filter the misses out first.
        start = match.end()
        candidates = [
            pos
            for pos in (
                content.find(start_of_insert, start),
                content.find(start_of_table, start),
            )
            if pos != -1
        ]
        end = min(candidates) if candidates else len(content)
        print(f"insert from {start} to {end}, length: {end-start}")

        print("Breaking insert into row strings...")
        insert_txt = content[start:end]
        row_strs = [_normalize_row(chunk) for chunk in insert_txt.split(row_separator)]
        print(f"Found {len(row_strs)} rows")

        print("Parsing SQL rows...")
        table_rows.setdefault(current_table, [])

        current_rows = []
        for i, row_str in enumerate(row_strs, 1):
            parsed_values = parse_sql_values(row_str)
            if len(parsed_values) == len(columns):
                current_rows.append(parsed_values)
            else:
                print(
                    f"Error: Row {i} in table {current_table} has {len(parsed_values)} values, expected {len(columns)}"
                )
            if i % 10000 == 0:
                print(f"Parsed {i} rows")
        table_rows[current_table].extend(current_rows)
        del current_rows
        gc.collect()

        print(
            f"Table {current_table} now has {len(table_rows[current_table])} rows"
        )

    # Write one CSV per table.
    parent_dir = os.path.dirname(output_dir)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)
    os.makedirs(output_dir, exist_ok=True)

    for table in table_rows:
        df = pd.DataFrame(table_rows[table], columns=table_cols[table])
        df.to_csv(f"{output_dir}/{table}.csv", index=False)
        del df
        gc.collect()
137
138
import os

# Repository root, resolved relative to this file (assumes this script
# lives two levels below the root -- TODO confirm against repo layout).
PROJECT_ROOT = os.path.join(os.path.dirname(__file__), "../..")
# Default input/output locations used by the interactive (# %%) cells.
input_file = f"{PROJECT_ROOT}/supabase/schema.sql"
output_dir = f"{PROJECT_ROOT}/dumps/big1"
144# %%
145# Usage
146
147# process_dump(input_file, output_dir)
148# %%
# Sample pg_dump excerpt for interactive testing: a "likes" data section
# followed by a "mentioned_users" section whose INSERT appears twice
# (exercises repeated INSERT statements for the same table).
insertions_str = """
--
-- Data for Name: likes; Type: TABLE DATA; Schema: public; Owner: postgres
--

INSERT INTO "public"."likes" ("id", "account_id", "liked_tweet_id", "archive_upload_id") OVERRIDING SYSTEM VALUE VALUES
	(747024, '1796120648281923584', '1826310911050629401', 17),
	(747025, '1796120648281923584', '1826257777892983255', 17),
	(747026, '1796120648281923584', '1826204870694916186', 17),
	(747027, '1796120648281923584', '1826296964881027483', 17),
	(747028, '1796120648281923584', '1825774665144340827', 17),
	(747029, '1796120648281923584', '1826024905227244030', 17),
	(747030, '1796120648281923584', '1826139046483013948', 17),
	(747031, '1796120648281923584', '1826078098481361307', 17),
	(747032, '1796120648281923584', '1826282730193318175', 17),
	(747033, '1796120648281923584', '1826271289017028776', 17),
	(747034, '1796120648281923584', '1826266593950564355', 17),
	(747035, '1796120648281923584', '1826265722634502301', 17),
	(747036, '1796120648281923584', '1826264433578148064', 17),
	(747037, '1796120648281923584', '1825928781367791868', 17),
	(747038, '1796120648281923584', '1826257231786242482', 17);



--
-- Data for Name: mentioned_users; Type: TABLE DATA; Schema: public; Owner: postgres
--

INSERT INTO "public"."mentioned_users" ("user_id", "name", "screen_name", "updated_at") VALUES
	('29961293', 'Geger Riyanto', 'gegerriy', '2024-09-05 18:06:03.635+00'),
	('15633111', 'Mason Currey', 'masoncurrey', '2024-09-04 15:25:09.757+00'),
	('44833706', 'synaptic stimuli', 'lindaraharja', '2024-09-05 18:06:03.634+00'),
	('10545', 'Mike Rundle', 'flyosity', '2024-09-03 11:52:44.385+00'),
	('21195588', 'Tim Pastoor', 'timpastoor', '2024-09-04 15:25:09.757+00'),
	('46991136', 'Gauchienne', 'gaucheian', '2024-09-05 18:06:03.637+00'),
	('24490384', 'Damian', 'BEEPLEofWALMART', '2024-09-04 15:25:09.757+00'),
	('182383', 'Ben Gold', 'bengold', '2024-09-06 18:29:53.843+00'),
	('53044984', 'Harbowoputra', 'harbowoputra', '2024-09-05 18:06:03.633+00'),
	('57816604', 'Maybe: Hegar', 'HPEgieara', '2024-09-05 18:06:03.636+00'),
	('74731503', 'Liam 🔻', 'cluelessdirectr', '2024-09-05 18:06:03.635+00'),
	('93421683', 'Yihui is returning to self', 'empirepowder', '2024-09-05 18:06:03.635+00'),
	('1106554797879119872', 'Victor', 'notnaughtknot', '2024-07-14 20:59:36+00'),
	('70894158', 'Ryan Abel', 'GeneralAntilles', '2024-09-04 15:25:09.758+00'),
	('110451384', 'Matt S "unpredictably hypergolic" Trout (mst)', 'shadowcat_mst', '2024-09-04 15:25:09.756+00'),
	('122484263', 'Urban Composition', 'urban_comp', '2024-09-04 15:25:09.758+00'),
	('100686498', 'Winda', 'windaul', '2024-09-05 18:06:03.634+00');


INSERT INTO "public"."mentioned_users" ("user_id", "name", "screen_name", "updated_at") VALUES
	('29961293', 'Geger Riyanto', 'gegerriy', '2024-09-05 18:06:03.635+00'),
	('15633111', 'Mason Currey', 'masoncurrey', '2024-09-04 15:25:09.757+00'),
	('44833706', 'synaptic stimuli', 'lindaraharja', '2024-09-05 18:06:03.634+00'),
	('10545', 'Mike Rundle', 'flyosity', '2024-09-03 11:52:44.385+00'),
	('21195588', 'Tim Pastoor', 'timpastoor', '2024-09-04 15:25:09.757+00'),
	('46991136', 'Gauchienne', 'gaucheian', '2024-09-05 18:06:03.637+00'),
	('24490384', 'Damian', 'BEEPLEofWALMART', '2024-09-04 15:25:09.757+00'),
	('182383', 'Ben Gold', 'bengold', '2024-09-06 18:29:53.843+00'),
	('53044984', 'Harbowoputra', 'harbowoputra', '2024-09-05 18:06:03.633+00'),
	('57816604', 'Maybe: Hegar', 'HPEgieara', '2024-09-05 18:06:03.636+00'),
	('74731503', 'Liam 🔻', 'cluelessdirectr', '2024-09-05 18:06:03.635+00'),
	('93421683', 'Yihui is returning to self', 'empirepowder', '2024-09-05 18:06:03.635+00'),
	('1106554797879119872', 'Victor', 'notnaughtknot', '2024-07-14 20:59:36+00'),
	('70894158', 'Ryan Abel', 'GeneralAntilles', '2024-09-04 15:25:09.758+00'),
	('110451384', 'Matt S "unpredictably hypergolic" Trout (mst)', 'shadowcat_mst', '2024-09-04 15:25:09.756+00'),
	('122484263', 'Urban Composition', 'urban_comp', '2024-09-04 15:25:09.758+00'),
	('100686498', 'Winda', 'windaul', '2024-09-05 18:06:03.634+00');
"""
216
# Sample VALUES fragment of (tweet_id, full_text) pairs, including texts
# that span multiple lines and contain SQL-escaped single quotes ('').
# NOTE(review): one row uses a curly apostrophe (it’s) instead of the ''
# escape -- preserved byte-for-byte from the original fixture.
rows_str = """('1823552071452242220', 'The plan isn''t that important in the end. But the plan allows you to take action & the action will generate better plans. This feedback loop is the important part.

You just have to start somewhere & dive in. https://t.co/I2nnM2rIHT'),
	('1823671598911463631', '@_samand_ @So8res curiously this is a great relationship advice'),
	('1823552064363938254', 'there is a blog post called by @So8res that helped me develop more agency & take imperfect action toward a goal

the main takeaway is if you''re ambitious and want to do something important, you have to get your hands dirty and try things https://t.co/YzgNt4sQuM'),
	('1823552068868604085', 'Alice''s plan is bad. But it''s better than Bob''s.

Why? Because Alice will be in the arena _trying things_.

Alice will be out there "bumping into the world" — I love this phrase. I think about it all the time. I want to bump into things!!! https://t.co/GeFbBrrv8d'),
	('1823478439875060100', 'the problem is not that insight is hard to get— the problem is it’s fucking scary'),
	('1823626245348688380', 'this portal retrospective ain''t gonna write itself folks'),
	('1823632275453469080', 'omg i need to leave my job NOW i''m suffering'),
	('1823655030404116482', '@nido_kween @exgenesis 😂😂😂'),
	('1823654543067865491', '@TheJointleman @exgenesis same');"""
234
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser(description="Process SQL dump")
    parser.add_argument("input_file", help="Path to the input SQL dump file")
    # Second positional is a directory (process_dump writes one CSV per
    # table into it); the old name/help text misleadingly called it a file.
    parser.add_argument(
        "output_dir", help="Directory to write the per-table CSV files into"
    )
    args = parser.parse_args()

    process_dump(args.input_file, args.output_dir)