package main import ( "encoding/json" "fmt" "os" "regexp" "strings" "time" "github.com/PuerkitoBio/goquery" ) // ParseDeparturesFromHTML extracts departure information from the National Rail HTML func ParseDeparturesFromHTML(htmlContent string) ([]Departure, error) { // Try JSON approach first (for new React-based site) if departures, err := ParseDeparturesFromJSONHTML(htmlContent); err == nil && len(departures) > 0 { return departures, nil } // Fallback to old HTML parsing approach return parseFromAriaLabels(htmlContent) } func parseFromAriaLabels(htmlContent string) ([]Departure, error) { doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent)) if err != nil { return nil, fmt.Errorf("failed to parse HTML: %w", err) } seenServices := make(map[string]bool) var departures []Departure // Find all anchor tags with aria-label containing service information doc.Find("a[aria-label*='service for']").Each(func(i int, s *goquery.Selection) { ariaLabel, exists := s.Attr("aria-label") if !exists { return } // Extract service ID from href if available href, _ := s.Attr("href") serviceID := extractServiceID(href) // Skip if we've already processed this service if seenServices[serviceID] { return } seenServices[serviceID] = true departure, err := ParseAriaLabel(ariaLabel) if err != nil || departure == nil { return } departure.ServiceID = serviceID departures = append(departures, *departure) }) return departures, nil } // extractServiceID extracts the service ID from the href URL func extractServiceID(href string) string { if strings.Contains(href, "sid=") { parts := strings.Split(href, "sid=") if len(parts) > 1 { sidPart := strings.Split(parts[1], "&")[0] return sidPart } } return "" } // ParseDeparturesFromFile reads an HTML file and extracts departure information func ParseDeparturesFromFile(filename string) ([]Departure, error) { content, err := os.ReadFile(filename) if err != nil { return nil, fmt.Errorf("failed to read file %s: %w", filename, err) } return ParseDeparturesFromHTML(string(content)) } // NextData represents the structure of the __NEXT_DATA__ JSON type NextData struct { Props struct { PageProps struct { LiveTrainsState struct { Queries []struct { State struct { Data struct { Pages []struct { Services []JSONService `json:"services"` } `json:"pages"` } `json:"data"` } `json:"state"` } `json:"queries"` } `json:"liveTrainsState"` } `json:"pageProps"` } `json:"props"` } type JSONService struct { RID string `json:"rid"` TrainUID string `json:"trainUid"` Origin []struct { LocationName string `json:"locationName"` CRS string `json:"crs"` Via *string `json:"via"` } `json:"origin"` Destination []struct { LocationName string `json:"locationName"` CRS string `json:"crs"` Via *string `json:"via"` } `json:"destination"` JourneyDetails struct { From struct { LocationName string `json:"locationName"` CRS string `json:"crs"` } `json:"from"` To struct { LocationName string `json:"locationName"` CRS string `json:"crs"` } `json:"to"` Stops int `json:"stops"` DepartureInfo struct { Scheduled string `json:"scheduled"` Estimated *string `json:"estimated"` Actual *string `json:"actual"` } `json:"departureInfo"` ArrivalInfo struct { Scheduled string `json:"scheduled"` Estimated *string `json:"estimated"` Actual *string `json:"actual"` } `json:"arrivalInfo"` } `json:"journeyDetails"` Operator struct { Name string `json:"name"` Code string `json:"code"` } `json:"operator"` Status struct { Status string `json:"status"` DelayReason *string `json:"delayReason"` CancelReason *string `json:"cancelReason"` } `json:"status"` Platform string `json:"platform"` } // ParseDeparturesFromJSONHTML extracts departure information from HTML containing __NEXT_DATA__ JSON func ParseDeparturesFromJSONHTML(htmlContent string) ([]Departure, error) { doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent)) if err != nil { return nil, fmt.Errorf("failed to parse HTML: %w", err) } // Find the __NEXT_DATA__ script tag var jsonContent string doc.Find("script#__NEXT_DATA__").Each(func(i int, s *goquery.Selection) { rawContent := s.Text() jsonContent = cleanJSONString(rawContent) }) if jsonContent == "" { return nil, fmt.Errorf("no __NEXT_DATA__ script tag found") } // Parse the JSON var nextData NextData err = json.Unmarshal([]byte(jsonContent), &nextData) if err != nil { return nil, fmt.Errorf("failed to parse JSON: %w", err) } // Extract services from the nested structure var departures []Departure if len(nextData.Props.PageProps.LiveTrainsState.Queries) > 0 && len(nextData.Props.PageProps.LiveTrainsState.Queries[0].State.Data.Pages) > 0 { services := nextData.Props.PageProps.LiveTrainsState.Queries[0].State.Data.Pages[0].Services for _, service := range services { departure := convertJSONServiceToDeparture(service) departures = append(departures, departure) } } return departures, nil } func convertJSONServiceToDeparture(service JSONService) Departure { departure := Departure{ ServiceID: service.RID, Platform: service.Platform, Stops: service.JourneyDetails.Stops, Operator: service.Operator.Name, } // Extract destination if len(service.Destination) > 0 { departure.Destination = service.Destination[0].LocationName if service.Destination[0].Via != nil && *service.Destination[0].Via != "" { departure.Via = *service.Destination[0].Via } } // Parse scheduled time if scheduledTime, err := time.Parse(time.RFC3339, service.JourneyDetails.DepartureInfo.Scheduled); err == nil { departure.ScheduledTime = scheduledTime.Format("15:04") } // Parse estimated/actual time and set status if service.JourneyDetails.DepartureInfo.Actual != nil { // Train has already departed if actualTime, err := time.Parse(time.RFC3339, *service.JourneyDetails.DepartureInfo.Actual); err == nil { departure.ExpectedTime = actualTime.Format("15:04") departure.Status = "Departed " + departure.ExpectedTime } } else if service.JourneyDetails.DepartureInfo.Estimated != nil { // Train has estimated departure time if estimatedTime, err := time.Parse(time.RFC3339, *service.JourneyDetails.DepartureInfo.Estimated); err == nil { departure.ExpectedTime = estimatedTime.Format("15:04") if departure.ExpectedTime == departure.ScheduledTime { departure.Status = "On time" } else { departure.Status = "Expected " + departure.ExpectedTime } } } else { // Use service status switch service.Status.Status { case "OnTime": departure.Status = "On time" case "Late": departure.Status = "Late" case "Cancelled": departure.Status = "Cancelled" default: departure.Status = service.Status.Status } } // Calculate duration (rough estimate) if scheduledDep, err1 := time.Parse(time.RFC3339, service.JourneyDetails.DepartureInfo.Scheduled); err1 == nil { if scheduledArr, err2 := time.Parse(time.RFC3339, service.JourneyDetails.ArrivalInfo.Scheduled); err2 == nil { duration := scheduledArr.Sub(scheduledDep) departure.Duration = fmt.Sprintf("%d minutes", int(duration.Minutes())) } } // Add delay reason if available if service.Status.DelayReason != nil && *service.Status.DelayReason != "" { departure.DelayReason = *service.Status.DelayReason } return departure } // cleanJSONString fixes common JSON issues from HTML extraction func cleanJSONString(s string) string { s = strings.TrimSpace(s) // Fix newlines within string literals // This is a simplified approach - we replace newlines within quoted strings re := regexp.MustCompile(`"([^"]*\n[^"]*)"`) s = re.ReplaceAllStringFunc(s, func(match string) string { // Remove quotes, replace newlines with spaces, add quotes back inner := match[1 : len(match)-1] // Remove surrounding quotes inner = regexp.MustCompile(`\s*\n\s*`).ReplaceAllString(inner, " ") return `"` + inner + `"` }) return s }