-202
json_parser.go
-202
json_parser.go
···
1
-
package main
2
-
3
-
import (
4
-
"encoding/json"
5
-
"fmt"
6
-
"regexp"
7
-
"strings"
8
-
"time"
9
-
10
-
"github.com/PuerkitoBio/goquery"
11
-
)
12
-
13
-
// NextData represents the structure of the __NEXT_DATA__ JSON
14
-
type NextData struct {
15
-
Props struct {
16
-
PageProps struct {
17
-
LiveTrainsState struct {
18
-
Queries []struct {
19
-
State struct {
20
-
Data struct {
21
-
Pages []struct {
22
-
Services []JSONService `json:"services"`
23
-
} `json:"pages"`
24
-
} `json:"data"`
25
-
} `json:"state"`
26
-
} `json:"queries"`
27
-
} `json:"liveTrainsState"`
28
-
} `json:"pageProps"`
29
-
} `json:"props"`
30
-
}
31
-
32
-
// JSONService is one element of a page's "services" array in the
// __NEXT_DATA__ payload.
//
// Pointer fields stay nil when the JSON value is null or the key is missing,
// which lets callers tell "absent" apart from an empty string.
type JSONService struct {
	RID      string `json:"rid"`
	TrainUID string `json:"trainUid"`

	// Origin and Destination are arrays in the payload and share one shape.
	// convertJSONServiceToDeparture reads only the first Destination entry.
	Origin []struct {
		LocationName string  `json:"locationName"`
		CRS          string  `json:"crs"`
		Via          *string `json:"via"`
	} `json:"origin"`
	Destination []struct {
		LocationName string  `json:"locationName"`
		CRS          string  `json:"crs"`
		Via          *string `json:"via"`
	} `json:"destination"`

	JourneyDetails struct {
		From struct {
			LocationName string `json:"locationName"`
			CRS          string `json:"crs"`
		} `json:"from"`
		To struct {
			LocationName string `json:"locationName"`
			CRS          string `json:"crs"`
		} `json:"to"`
		Stops int `json:"stops"`

		// Timestamps are kept as raw strings here;
		// convertJSONServiceToDeparture parses them as RFC 3339.
		DepartureInfo struct {
			Scheduled string  `json:"scheduled"`
			Estimated *string `json:"estimated"`
			Actual    *string `json:"actual"`
		} `json:"departureInfo"`
		ArrivalInfo struct {
			Scheduled string  `json:"scheduled"`
			Estimated *string `json:"estimated"`
			Actual    *string `json:"actual"`
		} `json:"arrivalInfo"`
	} `json:"journeyDetails"`

	Operator struct {
		Name string `json:"name"`
		Code string `json:"code"`
	} `json:"operator"`

	// Status.Status is a short code; convertJSONServiceToDeparture special-cases
	// the values "OnTime", "Late" and "Cancelled".
	Status struct {
		Status       string  `json:"status"`
		DelayReason  *string `json:"delayReason"`
		CancelReason *string `json:"cancelReason"`
	} `json:"status"`

	Platform string `json:"platform"`
}
77
-
78
-
// ParseDeparturesFromJSONHTML extracts departure information from HTML containing __NEXT_DATA__ JSON
79
-
func ParseDeparturesFromJSONHTML(htmlContent string) ([]Departure, error) {
80
-
doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
81
-
if err != nil {
82
-
return nil, fmt.Errorf("failed to parse HTML: %w", err)
83
-
}
84
-
85
-
// Find the __NEXT_DATA__ script tag
86
-
var jsonContent string
87
-
doc.Find("script#__NEXT_DATA__").Each(func(i int, s *goquery.Selection) {
88
-
rawContent := s.Text()
89
-
jsonContent = cleanJSONString(rawContent)
90
-
})
91
-
92
-
if jsonContent == "" {
93
-
return nil, fmt.Errorf("no __NEXT_DATA__ script tag found")
94
-
}
95
-
96
-
// Parse the JSON
97
-
var nextData NextData
98
-
err = json.Unmarshal([]byte(jsonContent), &nextData)
99
-
if err != nil {
100
-
return nil, fmt.Errorf("failed to parse JSON: %w", err)
101
-
}
102
-
103
-
// Extract services from the nested structure
104
-
var departures []Departure
105
-
if len(nextData.Props.PageProps.LiveTrainsState.Queries) > 0 &&
106
-
len(nextData.Props.PageProps.LiveTrainsState.Queries[0].State.Data.Pages) > 0 {
107
-
108
-
services := nextData.Props.PageProps.LiveTrainsState.Queries[0].State.Data.Pages[0].Services
109
-
110
-
for _, service := range services {
111
-
departure := convertJSONServiceToDeparture(service)
112
-
departures = append(departures, departure)
113
-
}
114
-
}
115
-
116
-
return departures, nil
117
-
}
118
-
119
-
func convertJSONServiceToDeparture(service JSONService) Departure {
120
-
departure := Departure{
121
-
ServiceID: service.RID,
122
-
Platform: service.Platform,
123
-
Stops: service.JourneyDetails.Stops,
124
-
Operator: service.Operator.Name,
125
-
}
126
-
127
-
// Extract destination
128
-
if len(service.Destination) > 0 {
129
-
departure.Destination = service.Destination[0].LocationName
130
-
if service.Destination[0].Via != nil && *service.Destination[0].Via != "" {
131
-
departure.Via = *service.Destination[0].Via
132
-
}
133
-
}
134
-
135
-
// Parse scheduled time
136
-
if scheduledTime, err := time.Parse(time.RFC3339, service.JourneyDetails.DepartureInfo.Scheduled); err == nil {
137
-
departure.ScheduledTime = scheduledTime.Format("15:04")
138
-
}
139
-
140
-
// Parse estimated/actual time and set status
141
-
if service.JourneyDetails.DepartureInfo.Actual != nil {
142
-
// Train has already departed
143
-
if actualTime, err := time.Parse(time.RFC3339, *service.JourneyDetails.DepartureInfo.Actual); err == nil {
144
-
departure.ExpectedTime = actualTime.Format("15:04")
145
-
departure.Status = "Departed " + departure.ExpectedTime
146
-
}
147
-
} else if service.JourneyDetails.DepartureInfo.Estimated != nil {
148
-
// Train has estimated departure time
149
-
if estimatedTime, err := time.Parse(time.RFC3339, *service.JourneyDetails.DepartureInfo.Estimated); err == nil {
150
-
departure.ExpectedTime = estimatedTime.Format("15:04")
151
-
if departure.ExpectedTime == departure.ScheduledTime {
152
-
departure.Status = "On time"
153
-
} else {
154
-
departure.Status = "Expected " + departure.ExpectedTime
155
-
}
156
-
}
157
-
} else {
158
-
// Use service status
159
-
switch service.Status.Status {
160
-
case "OnTime":
161
-
departure.Status = "On time"
162
-
case "Late":
163
-
departure.Status = "Late"
164
-
case "Cancelled":
165
-
departure.Status = "Cancelled"
166
-
default:
167
-
departure.Status = service.Status.Status
168
-
}
169
-
}
170
-
171
-
// Calculate duration (rough estimate)
172
-
if scheduledDep, err1 := time.Parse(time.RFC3339, service.JourneyDetails.DepartureInfo.Scheduled); err1 == nil {
173
-
if scheduledArr, err2 := time.Parse(time.RFC3339, service.JourneyDetails.ArrivalInfo.Scheduled); err2 == nil {
174
-
duration := scheduledArr.Sub(scheduledDep)
175
-
departure.Duration = fmt.Sprintf("%d minutes", int(duration.Minutes()))
176
-
}
177
-
}
178
-
179
-
// Add delay reason if available
180
-
if service.Status.DelayReason != nil && *service.Status.DelayReason != "" {
181
-
departure.DelayReason = *service.Status.DelayReason
182
-
}
183
-
184
-
return departure
185
-
}
186
-
187
-
// rawLineBreak matches one raw line break together with the whitespace
// directly around it. It is compiled once at package scope rather than on
// every call.
var rawLineBreak = regexp.MustCompile(`\s*\n\s*`)

// cleanJSONString repairs JSON text taken from an HTML script element so that
// encoding/json can decode it.
//
// Leading and trailing whitespace is trimmed. Inside each string literal,
// every raw line break (plus the whitespace around it) is collapsed to a
// single space, because JSON does not allow unescaped line breaks in strings.
// Text outside string literals is copied through unchanged.
//
// Literals are located by scanning for quotes and skipping backslash escapes,
// so a line break between two tokens can never be mistaken for the inside of
// a string. An unterminated literal is left as it is.
func cleanJSONString(s string) string {
	s = strings.TrimSpace(s)
	if !strings.Contains(s, "\n") {
		return s // nothing to repair
	}

	var out strings.Builder
	out.Grow(len(s))

	for pos := 0; pos < len(s); {
		open := strings.IndexByte(s[pos:], '"')
		if open < 0 {
			out.WriteString(s[pos:])
			break
		}
		open += pos
		out.WriteString(s[pos:open])

		// Find the closing quote, stepping over escaped characters such as \".
		end := open + 1
		for end < len(s) && s[end] != '"' {
			if s[end] == '\\' {
				end++
			}
			end++
		}
		if end >= len(s) {
			// No closing quote: copy the remainder untouched.
			out.WriteString(s[open:])
			break
		}

		body := s[open+1 : end]
		if strings.Contains(body, "\n") {
			body = rawLineBreak.ReplaceAllString(body, " ")
		}
		out.WriteByte('"')
		out.WriteString(body)
		out.WriteByte('"')
		pos = end + 1
	}

	return out.String()
}
···
+195
-1
parser.go
+195
-1
parser.go
···
1
package main
2
3
import (
4
+
"encoding/json"
5
"fmt"
6
"os"
7
+
"regexp"
8
"strings"
9
+
"time"
10
11
"github.com/PuerkitoBio/goquery"
12
)
···
80
}
81
82
return ParseDeparturesFromHTML(string(content))
83
+
}
84
+
85
+
// NextData represents the structure of the __NEXT_DATA__ JSON
86
+
type NextData struct {
87
+
Props struct {
88
+
PageProps struct {
89
+
LiveTrainsState struct {
90
+
Queries []struct {
91
+
State struct {
92
+
Data struct {
93
+
Pages []struct {
94
+
Services []JSONService `json:"services"`
95
+
} `json:"pages"`
96
+
} `json:"data"`
97
+
} `json:"state"`
98
+
} `json:"queries"`
99
+
} `json:"liveTrainsState"`
100
+
} `json:"pageProps"`
101
+
} `json:"props"`
102
+
}
103
+
104
+
// JSONService is one element of a page's "services" array in the
// __NEXT_DATA__ payload.
//
// Pointer fields stay nil when the JSON value is null or the key is missing,
// which lets callers tell "absent" apart from an empty string.
type JSONService struct {
	RID      string `json:"rid"`
	TrainUID string `json:"trainUid"`

	// Origin and Destination are arrays in the payload and share one shape.
	// convertJSONServiceToDeparture reads only the first Destination entry.
	Origin []struct {
		LocationName string  `json:"locationName"`
		CRS          string  `json:"crs"`
		Via          *string `json:"via"`
	} `json:"origin"`
	Destination []struct {
		LocationName string  `json:"locationName"`
		CRS          string  `json:"crs"`
		Via          *string `json:"via"`
	} `json:"destination"`

	JourneyDetails struct {
		From struct {
			LocationName string `json:"locationName"`
			CRS          string `json:"crs"`
		} `json:"from"`
		To struct {
			LocationName string `json:"locationName"`
			CRS          string `json:"crs"`
		} `json:"to"`
		Stops int `json:"stops"`

		// Timestamps are kept as raw strings here;
		// convertJSONServiceToDeparture parses them as RFC 3339.
		DepartureInfo struct {
			Scheduled string  `json:"scheduled"`
			Estimated *string `json:"estimated"`
			Actual    *string `json:"actual"`
		} `json:"departureInfo"`
		ArrivalInfo struct {
			Scheduled string  `json:"scheduled"`
			Estimated *string `json:"estimated"`
			Actual    *string `json:"actual"`
		} `json:"arrivalInfo"`
	} `json:"journeyDetails"`

	Operator struct {
		Name string `json:"name"`
		Code string `json:"code"`
	} `json:"operator"`

	// Status.Status is a short code; convertJSONServiceToDeparture special-cases
	// the values "OnTime", "Late" and "Cancelled".
	Status struct {
		Status       string  `json:"status"`
		DelayReason  *string `json:"delayReason"`
		CancelReason *string `json:"cancelReason"`
	} `json:"status"`

	Platform string `json:"platform"`
}
149
+
150
+
// ParseDeparturesFromJSONHTML extracts departure information from HTML containing __NEXT_DATA__ JSON
151
+
func ParseDeparturesFromJSONHTML(htmlContent string) ([]Departure, error) {
152
+
doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
153
+
if err != nil {
154
+
return nil, fmt.Errorf("failed to parse HTML: %w", err)
155
+
}
156
+
157
+
// Find the __NEXT_DATA__ script tag
158
+
var jsonContent string
159
+
doc.Find("script#__NEXT_DATA__").Each(func(i int, s *goquery.Selection) {
160
+
rawContent := s.Text()
161
+
jsonContent = cleanJSONString(rawContent)
162
+
})
163
+
164
+
if jsonContent == "" {
165
+
return nil, fmt.Errorf("no __NEXT_DATA__ script tag found")
166
+
}
167
+
168
+
// Parse the JSON
169
+
var nextData NextData
170
+
err = json.Unmarshal([]byte(jsonContent), &nextData)
171
+
if err != nil {
172
+
return nil, fmt.Errorf("failed to parse JSON: %w", err)
173
+
}
174
+
175
+
// Extract services from the nested structure
176
+
var departures []Departure
177
+
if len(nextData.Props.PageProps.LiveTrainsState.Queries) > 0 &&
178
+
len(nextData.Props.PageProps.LiveTrainsState.Queries[0].State.Data.Pages) > 0 {
179
+
180
+
services := nextData.Props.PageProps.LiveTrainsState.Queries[0].State.Data.Pages[0].Services
181
+
182
+
for _, service := range services {
183
+
departure := convertJSONServiceToDeparture(service)
184
+
departures = append(departures, departure)
185
+
}
186
+
}
187
+
188
+
return departures, nil
189
+
}
190
+
191
+
func convertJSONServiceToDeparture(service JSONService) Departure {
192
+
departure := Departure{
193
+
ServiceID: service.RID,
194
+
Platform: service.Platform,
195
+
Stops: service.JourneyDetails.Stops,
196
+
Operator: service.Operator.Name,
197
+
}
198
+
199
+
// Extract destination
200
+
if len(service.Destination) > 0 {
201
+
departure.Destination = service.Destination[0].LocationName
202
+
if service.Destination[0].Via != nil && *service.Destination[0].Via != "" {
203
+
departure.Via = *service.Destination[0].Via
204
+
}
205
+
}
206
+
207
+
// Parse scheduled time
208
+
if scheduledTime, err := time.Parse(time.RFC3339, service.JourneyDetails.DepartureInfo.Scheduled); err == nil {
209
+
departure.ScheduledTime = scheduledTime.Format("15:04")
210
+
}
211
+
212
+
// Parse estimated/actual time and set status
213
+
if service.JourneyDetails.DepartureInfo.Actual != nil {
214
+
// Train has already departed
215
+
if actualTime, err := time.Parse(time.RFC3339, *service.JourneyDetails.DepartureInfo.Actual); err == nil {
216
+
departure.ExpectedTime = actualTime.Format("15:04")
217
+
departure.Status = "Departed " + departure.ExpectedTime
218
+
}
219
+
} else if service.JourneyDetails.DepartureInfo.Estimated != nil {
220
+
// Train has estimated departure time
221
+
if estimatedTime, err := time.Parse(time.RFC3339, *service.JourneyDetails.DepartureInfo.Estimated); err == nil {
222
+
departure.ExpectedTime = estimatedTime.Format("15:04")
223
+
if departure.ExpectedTime == departure.ScheduledTime {
224
+
departure.Status = "On time"
225
+
} else {
226
+
departure.Status = "Expected " + departure.ExpectedTime
227
+
}
228
+
}
229
+
} else {
230
+
// Use service status
231
+
switch service.Status.Status {
232
+
case "OnTime":
233
+
departure.Status = "On time"
234
+
case "Late":
235
+
departure.Status = "Late"
236
+
case "Cancelled":
237
+
departure.Status = "Cancelled"
238
+
default:
239
+
departure.Status = service.Status.Status
240
+
}
241
+
}
242
+
243
+
// Calculate duration (rough estimate)
244
+
if scheduledDep, err1 := time.Parse(time.RFC3339, service.JourneyDetails.DepartureInfo.Scheduled); err1 == nil {
245
+
if scheduledArr, err2 := time.Parse(time.RFC3339, service.JourneyDetails.ArrivalInfo.Scheduled); err2 == nil {
246
+
duration := scheduledArr.Sub(scheduledDep)
247
+
departure.Duration = fmt.Sprintf("%d minutes", int(duration.Minutes()))
248
+
}
249
+
}
250
+
251
+
// Add delay reason if available
252
+
if service.Status.DelayReason != nil && *service.Status.DelayReason != "" {
253
+
departure.DelayReason = *service.Status.DelayReason
254
+
}
255
+
256
+
return departure
257
+
}
258
+
259
+
// rawLineBreak matches one raw line break together with the whitespace
// directly around it. It is compiled once at package scope rather than on
// every call.
var rawLineBreak = regexp.MustCompile(`\s*\n\s*`)

// cleanJSONString repairs JSON text taken from an HTML script element so that
// encoding/json can decode it.
//
// Leading and trailing whitespace is trimmed. Inside each string literal,
// every raw line break (plus the whitespace around it) is collapsed to a
// single space, because JSON does not allow unescaped line breaks in strings.
// Text outside string literals is copied through unchanged.
//
// Literals are located by scanning for quotes and skipping backslash escapes,
// so a line break between two tokens can never be mistaken for the inside of
// a string. An unterminated literal is left as it is.
func cleanJSONString(s string) string {
	s = strings.TrimSpace(s)
	if !strings.Contains(s, "\n") {
		return s // nothing to repair
	}

	var out strings.Builder
	out.Grow(len(s))

	for pos := 0; pos < len(s); {
		open := strings.IndexByte(s[pos:], '"')
		if open < 0 {
			out.WriteString(s[pos:])
			break
		}
		open += pos
		out.WriteString(s[pos:open])

		// Find the closing quote, stepping over escaped characters such as \".
		end := open + 1
		for end < len(s) && s[end] != '"' {
			if s[end] == '\\' {
				end++
			}
			end++
		}
		if end >= len(s) {
			// No closing quote: copy the remainder untouched.
			out.WriteString(s[open:])
			break
		}

		body := s[open+1 : end]
		if strings.Contains(body, "\n") {
			body = rawLineBreak.ReplaceAllString(body, " ")
		}
		out.WriteByte('"')
		out.WriteString(body)
		out.WriteByte('"')
		pos = end + 1
	}

	return out.String()
}