1package main
2
3import (
4 "encoding/json"
5 "fmt"
6 "os"
7 "regexp"
8 "strings"
9 "time"
10
11 "github.com/PuerkitoBio/goquery"
12)
13
14// ParseDeparturesFromHTML extracts departure information from the National Rail HTML
15func ParseDeparturesFromHTML(htmlContent string) ([]Departure, error) {
16 // Try JSON approach first (for new React-based site)
17 if departures, err := ParseDeparturesFromJSONHTML(htmlContent); err == nil && len(departures) > 0 {
18 return departures, nil
19 }
20
21 // Fallback to old HTML parsing approach
22 return parseFromAriaLabels(htmlContent)
23}
24
25func parseFromAriaLabels(htmlContent string) ([]Departure, error) {
26 doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
27 if err != nil {
28 return nil, fmt.Errorf("failed to parse HTML: %w", err)
29 }
30
31 seenServices := make(map[string]bool)
32 var departures []Departure
33
34 // Find all anchor tags with aria-label containing service information
35 doc.Find("a[aria-label*='service for']").Each(func(i int, s *goquery.Selection) {
36 ariaLabel, exists := s.Attr("aria-label")
37 if !exists {
38 return
39 }
40
41 // Extract service ID from href if available
42 href, _ := s.Attr("href")
43 serviceID := extractServiceID(href)
44
45 // Skip if we've already processed this service
46 if seenServices[serviceID] {
47 return
48 }
49 seenServices[serviceID] = true
50
51 departure, err := ParseAriaLabel(ariaLabel)
52 if err != nil || departure == nil {
53 return
54 }
55
56 departure.ServiceID = serviceID
57 departures = append(departures, *departure)
58 })
59
60 return departures, nil
61}
62
// extractServiceID returns the text that follows the first "sid=" in href,
// cut off at the next "sid=" or "&", whichever comes first. It returns the
// empty string when href contains no "sid=".
func extractServiceID(href string) string {
	const marker = "sid="

	start := strings.Index(href, marker)
	if start < 0 {
		return ""
	}
	value := href[start+len(marker):]

	// Trim at a repeated marker first, then at the parameter separator.
	for _, stop := range []string{marker, "&"} {
		if end := strings.Index(value, stop); end >= 0 {
			value = value[:end]
		}
	}
	return value
}
74
75// ParseDeparturesFromFile reads an HTML file and extracts departure information
76func ParseDeparturesFromFile(filename string) ([]Departure, error) {
77 content, err := os.ReadFile(filename)
78 if err != nil {
79 return nil, fmt.Errorf("failed to read file %s: %w", filename, err)
80 }
81
82 return ParseDeparturesFromHTML(string(content))
83}
84
// NextData mirrors the part of the __NEXT_DATA__ JSON document that holds the
// live train services. Only the path
//
//	props.pageProps.liveTrainsState.queries[].state.data.pages[].services
//
// is modelled; any other key in the document has no matching field and is
// therefore dropped by json.Unmarshal.
type NextData struct {
	Props struct {
		PageProps struct {
			LiveTrainsState struct {
				// Queries and Pages are both lists; ParseDeparturesFromJSONHTML
				// reads only the first entry of each.
				Queries []struct {
					State struct {
						Data struct {
							Pages []struct {
								Services []JSONService `json:"services"`
							} `json:"pages"`
						} `json:"data"`
					} `json:"state"`
				} `json:"queries"`
			} `json:"liveTrainsState"`
		} `json:"pageProps"`
	} `json:"props"`
}
103
// JSONService is one entry of a "services" array in the __NEXT_DATA__ JSON.
// convertJSONServiceToDeparture turns it into a Departure.
//
// Pointer-typed string fields are nil when the JSON value is null or the key
// is absent.
type JSONService struct {
	// RID is copied into Departure.ServiceID.
	RID      string `json:"rid"`
	TrainUID string `json:"trainUid"`
	Origin   []struct {
		LocationName string  `json:"locationName"`
		CRS          string  `json:"crs"`
		Via          *string `json:"via"`
	} `json:"origin"`
	// Destination is a list; only its first entry is used for the
	// departure's destination name and "via" text.
	Destination []struct {
		LocationName string  `json:"locationName"`
		CRS          string  `json:"crs"`
		Via          *string `json:"via"`
	} `json:"destination"`
	JourneyDetails struct {
		From struct {
			LocationName string `json:"locationName"`
			CRS          string `json:"crs"`
		} `json:"from"`
		To struct {
			LocationName string `json:"locationName"`
			CRS          string `json:"crs"`
		} `json:"to"`
		Stops int `json:"stops"`
		// DepartureInfo timestamps are parsed as RFC 3339 by
		// convertJSONServiceToDeparture.
		DepartureInfo struct {
			Scheduled string  `json:"scheduled"`
			Estimated *string `json:"estimated"`
			Actual    *string `json:"actual"`
		} `json:"departureInfo"`
		// ArrivalInfo.Scheduled is parsed as RFC 3339 and used, together with
		// the scheduled departure, to compute the journey duration.
		ArrivalInfo struct {
			Scheduled string  `json:"scheduled"`
			Estimated *string `json:"estimated"`
			Actual    *string `json:"actual"`
		} `json:"arrivalInfo"`
	} `json:"journeyDetails"`
	Operator struct {
		Name string `json:"name"`
		Code string `json:"code"`
	} `json:"operator"`
	Status struct {
		// Status values handled explicitly by the converter are "OnTime",
		// "Late" and "Cancelled"; any other value is passed through as-is.
		Status       string  `json:"status"`
		DelayReason  *string `json:"delayReason"`
		CancelReason *string `json:"cancelReason"`
	} `json:"status"`
	Platform string `json:"platform"`
}
149
150// ParseDeparturesFromJSONHTML extracts departure information from HTML containing __NEXT_DATA__ JSON
151func ParseDeparturesFromJSONHTML(htmlContent string) ([]Departure, error) {
152 doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent))
153 if err != nil {
154 return nil, fmt.Errorf("failed to parse HTML: %w", err)
155 }
156
157 // Find the __NEXT_DATA__ script tag
158 var jsonContent string
159 doc.Find("script#__NEXT_DATA__").Each(func(i int, s *goquery.Selection) {
160 rawContent := s.Text()
161 jsonContent = cleanJSONString(rawContent)
162 })
163
164 if jsonContent == "" {
165 return nil, fmt.Errorf("no __NEXT_DATA__ script tag found")
166 }
167
168 // Parse the JSON
169 var nextData NextData
170 err = json.Unmarshal([]byte(jsonContent), &nextData)
171 if err != nil {
172 return nil, fmt.Errorf("failed to parse JSON: %w", err)
173 }
174
175 // Extract services from the nested structure
176 var departures []Departure
177 if len(nextData.Props.PageProps.LiveTrainsState.Queries) > 0 &&
178 len(nextData.Props.PageProps.LiveTrainsState.Queries[0].State.Data.Pages) > 0 {
179
180 services := nextData.Props.PageProps.LiveTrainsState.Queries[0].State.Data.Pages[0].Services
181
182 for _, service := range services {
183 departure := convertJSONServiceToDeparture(service)
184 departures = append(departures, departure)
185 }
186 }
187
188 return departures, nil
189}
190
191func convertJSONServiceToDeparture(service JSONService) Departure {
192 departure := Departure{
193 ServiceID: service.RID,
194 Platform: service.Platform,
195 Stops: service.JourneyDetails.Stops,
196 Operator: service.Operator.Name,
197 }
198
199 // Extract destination
200 if len(service.Destination) > 0 {
201 departure.Destination = service.Destination[0].LocationName
202 if service.Destination[0].Via != nil && *service.Destination[0].Via != "" {
203 departure.Via = *service.Destination[0].Via
204 }
205 }
206
207 // Parse scheduled time
208 if scheduledTime, err := time.Parse(time.RFC3339, service.JourneyDetails.DepartureInfo.Scheduled); err == nil {
209 departure.ScheduledTime = scheduledTime.Format("15:04")
210 }
211
212 // Parse estimated/actual time and set status
213 if service.JourneyDetails.DepartureInfo.Actual != nil {
214 // Train has already departed
215 if actualTime, err := time.Parse(time.RFC3339, *service.JourneyDetails.DepartureInfo.Actual); err == nil {
216 departure.ExpectedTime = actualTime.Format("15:04")
217 departure.Status = "Departed " + departure.ExpectedTime
218 }
219 } else if service.JourneyDetails.DepartureInfo.Estimated != nil {
220 // Train has estimated departure time
221 if estimatedTime, err := time.Parse(time.RFC3339, *service.JourneyDetails.DepartureInfo.Estimated); err == nil {
222 departure.ExpectedTime = estimatedTime.Format("15:04")
223 if departure.ExpectedTime == departure.ScheduledTime {
224 departure.Status = "On time"
225 } else {
226 departure.Status = "Expected " + departure.ExpectedTime
227 }
228 }
229 } else {
230 // Use service status
231 switch service.Status.Status {
232 case "OnTime":
233 departure.Status = "On time"
234 case "Late":
235 departure.Status = "Late"
236 case "Cancelled":
237 departure.Status = "Cancelled"
238 default:
239 departure.Status = service.Status.Status
240 }
241 }
242
243 // Calculate duration (rough estimate)
244 if scheduledDep, err1 := time.Parse(time.RFC3339, service.JourneyDetails.DepartureInfo.Scheduled); err1 == nil {
245 if scheduledArr, err2 := time.Parse(time.RFC3339, service.JourneyDetails.ArrivalInfo.Scheduled); err2 == nil {
246 duration := scheduledArr.Sub(scheduledDep)
247 departure.Duration = fmt.Sprintf("%d minutes", int(duration.Minutes()))
248 }
249 }
250
251 // Add delay reason if available
252 if service.Status.DelayReason != nil && *service.Status.DelayReason != "" {
253 departure.DelayReason = *service.Status.DelayReason
254 }
255
256 return departure
257}
258
// jsonStringNewlineRun matches a run of whitespace that contains at least one
// newline. It is compiled once rather than on every call.
var jsonStringNewlineRun = regexp.MustCompile(`\s*\n\s*`)

// cleanJSONString repairs JSON text extracted from HTML in which string
// literals contain raw line breaks (which JSON does not allow).
//
// Surrounding whitespace is trimmed. Then, inside every double-quoted string
// literal, each run of whitespace containing a newline is replaced by a
// single space. Text outside string literals is left untouched. Backslash
// escapes are honoured when locating the end of a literal, so an escaped
// quote (\") does not terminate it. If a literal is never closed, the
// remainder of the input is returned unchanged.
func cleanJSONString(s string) string {
	s = strings.TrimSpace(s)
	if !strings.Contains(s, "\n") {
		return s // nothing to repair
	}

	var out strings.Builder
	out.Grow(len(s))

	rest := s
	for {
		open := strings.IndexByte(rest, '"')
		if open < 0 {
			break
		}

		// Locate the matching closing quote, skipping escaped characters.
		end := -1
		for i := open + 1; i < len(rest); i++ {
			if rest[i] == '\\' {
				i++ // skip the character being escaped
				continue
			}
			if rest[i] == '"' {
				end = i
				break
			}
		}
		if end < 0 {
			break // unterminated literal: leave the tail as it is
		}

		out.WriteString(rest[:open])
		out.WriteString(jsonStringNewlineRun.ReplaceAllString(rest[open:end+1], " "))
		rest = rest[end+1:]
	}
	out.WriteString(rest)

	return out.String()
}