dashboard of nationalrail train times

sundial: scraping logic from nationalrail

Signed-off-by: oppiliappan <me@oppi.li>

oppi.li 769d7446 a62c4e01

verified
+36
config.go
··· 1 + package main 2 + 3 + import ( 4 + "fmt" 5 + "os" 6 + 7 + "gopkg.in/yaml.v3" 8 + ) 9 + 10 + type Config struct { 11 + Server struct { 12 + Port int `yaml:"port"` 13 + RefreshInterval int `yaml:"refresh_interval"` 14 + } `yaml:"server"` 15 + Stations []Station `yaml:"stations"` 16 + } 17 + 18 + type Station struct { 19 + Name string `yaml:"name"` 20 + Code string `yaml:"code"` 21 + } 22 + 23 + func LoadConfig(filename string) (*Config, error) { 24 + data, err := os.ReadFile(filename) 25 + if err != nil { 26 + return nil, fmt.Errorf("failed to read config file: %w", err) 27 + } 28 + 29 + var config Config 30 + err = yaml.Unmarshal(data, &config) 31 + if err != nil { 32 + return nil, fmt.Errorf("failed to parse config file: %w", err) 33 + } 34 + 35 + return &config, nil 36 + }
+9
config.yaml
# Settings decoded into Config.Server by LoadConfig.
server:
  # NOTE(review): presumably the port the web server listens on; the server
  # code is not part of this change, so confirm there.
  port: 8080
  refresh_interval: 60 # seconds

# Stations decoded into Config.Stations. "code" is appended to the National
# Rail departures base URL when fetching; "name" is carried through to the
# per-station results unchanged.
stations:
  - name: "Lewisham"
    code: "lewisham"
  - name: "Ladywell"
    code: "ladywell"
+60
departure.go
package main

import (
	"regexp"
	"strconv"
	"strings"
)

// Departure is one train departure in the shape that is serialised to JSON.
type Departure struct {
	ScheduledTime string `json:"scheduled_time"`
	ExpectedTime  string `json:"expected_time,omitempty"`
	Status        string `json:"status"` // "On time", "Expected", "Delayed", etc.
	Destination   string `json:"destination"`
	Via           string `json:"via,omitempty"`
	Platform      string `json:"platform"`
	Duration      string `json:"duration"`
	Stops         int    `json:"stops"`
	Operator      string `json:"operator"`
	DelayReason   string `json:"delay_reason,omitempty"`
	ServiceID     string `json:"service_id,omitempty"`
}

// The aria-label patterns are compiled once at package initialisation rather
// than on every ParseAriaLabel call. A compiled *regexp.Regexp is safe for
// concurrent use, so sharing them is fine.
var (
	// delayedLabelPattern matches labels of the form
	// "<reason>, HH:MM, Expected HH:MM, service for <dest>[, via <via>], ...".
	delayedLabelPattern = regexp.MustCompile(`^(.*?), (\d{2}:\d{2}), Expected (\d{2}:\d{2}), service for ([^,]+)(?:, via ([^,]+))?, calling at [^,]+, from platform (\d+), duration (\d+) minutes, (\d+) stops, operated by (.+)$`)

	// normalLabelPattern matches labels of the form
	// "HH:MM, On time|Cancelled, service for <dest>[, via <via>], ...".
	normalLabelPattern = regexp.MustCompile(`^(\d{2}:\d{2}), (On time|Cancelled), service for ([^,]+)(?:, via ([^,]+))?, calling at [^,]+, from platform (\d+), duration (\d+) minutes, (\d+) stops, operated by (.+)$`)
)

// ParseAriaLabel extracts departure information from the text of an
// aria-label attribute.
//
// Two label shapes are recognised: a delayed form (leading reason text,
// scheduled time, "Expected HH:MM") and a normal form (scheduled time followed
// by "On time" or "Cancelled"). The delayed form is tried first.
//
// When the label matches neither shape the function returns (nil, nil): the
// error result is never non-nil, so callers must check the returned pointer.
func ParseAriaLabel(ariaLabel string) (*Departure, error) {
	if m := delayedLabelPattern.FindStringSubmatch(ariaLabel); m != nil {
		// m[1]=reason, m[2]=scheduled, m[3]=expected, m[4:]=shared tail.
		departure := departureFromLabelFields(m[2], m[4:])
		departure.DelayReason = m[1]
		departure.ExpectedTime = m[3]
		departure.Status = "Expected " + m[3]
		return departure, nil
	}

	if m := normalLabelPattern.FindStringSubmatch(ariaLabel); m != nil {
		// m[1]=scheduled, m[2]=status, m[3:]=shared tail.
		departure := departureFromLabelFields(m[1], m[3:])
		departure.Status = m[2]
		return departure, nil
	}

	return nil, nil // Unable to parse
}

// departureFromLabelFields builds a Departure from the capture groups both
// label patterns share. tail must hold, in order: destination, via (empty
// when the optional group did not participate), platform, duration in
// minutes, stop count, operator.
func departureFromLabelFields(scheduled string, tail []string) *Departure {
	// The pattern only admits decimal digits here, so Atoi can fail solely on
	// overflow; in that case Stops is left at Atoi's zero result.
	stops, _ := strconv.Atoi(tail[4])

	return &Departure{
		ScheduledTime: scheduled,
		Destination:   strings.TrimSpace(tail[0]),
		Via:           strings.TrimSpace(tail[1]),
		Platform:      tail[2],
		Duration:      tail[3] + " minutes",
		Stops:         stops,
		Operator:      strings.TrimSpace(tail[5]),
	}
}
+79
fetcher.go
··· 1 + package main 2 + 3 + import ( 4 + "fmt" 5 + "io" 6 + "net/http" 7 + "time" 8 + ) 9 + 10 + const BaseURL = "https://www.nationalrail.co.uk/live-trains/departures/" 11 + 12 + // FetchStationDepartures fetches the HTML content for a station's departures 13 + func FetchStationDepartures(stationCode string) (string, error) { 14 + client := &http.Client{ 15 + Timeout: 30 * time.Second, 16 + } 17 + 18 + url := BaseURL + stationCode + "/" 19 + req, err := http.NewRequest("GET", url, nil) 20 + if err != nil { 21 + return "", fmt.Errorf("failed to create request: %w", err) 22 + } 23 + 24 + // Add headers to mimic a browser request 25 + req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36") 26 + req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8") 27 + 28 + resp, err := client.Do(req) 29 + if err != nil { 30 + return "", fmt.Errorf("failed to fetch data: %w", err) 31 + } 32 + defer resp.Body.Close() 33 + 34 + if resp.StatusCode != http.StatusOK { 35 + return "", fmt.Errorf("unexpected status code: %d", resp.StatusCode) 36 + } 37 + 38 + body, err := io.ReadAll(resp.Body) 39 + if err != nil { 40 + return "", fmt.Errorf("failed to read response body: %w", err) 41 + } 42 + 43 + return string(body), nil 44 + } 45 + 46 + type StationData struct { 47 + Station Station `json:"station"` 48 + Departures []Departure `json:"departures"` 49 + LastUpdate time.Time `json:"last_update"` 50 + Error string `json:"error,omitempty"` 51 + } 52 + 53 + // FetchAllStationsData fetches departure data for all configured stations 54 + func FetchAllStationsData(stations []Station) []StationData { 55 + results := make([]StationData, len(stations)) 56 + 57 + for i, station := range stations { 58 + results[i] = StationData{ 59 + Station: station, 60 + LastUpdate: time.Now(), 61 + } 62 + 63 + htmlContent, err := FetchStationDepartures(station.Code) 64 + if err != nil { 
65 + results[i].Error = err.Error() 66 + continue 67 + } 68 + 69 + departures, err := ParseDeparturesFromHTML(htmlContent) 70 + if err != nil { 71 + results[i].Error = err.Error() 72 + continue 73 + } 74 + 75 + results[i].Departures = departures 76 + } 77 + 78 + return results 79 + }
+202
json_parser.go
··· 1 + package main 2 + 3 + import ( 4 + "encoding/json" 5 + "fmt" 6 + "regexp" 7 + "strings" 8 + "time" 9 + 10 + "github.com/PuerkitoBio/goquery" 11 + ) 12 + 13 + // NextData represents the structure of the __NEXT_DATA__ JSON 14 + type NextData struct { 15 + Props struct { 16 + PageProps struct { 17 + LiveTrainsState struct { 18 + Queries []struct { 19 + State struct { 20 + Data struct { 21 + Pages []struct { 22 + Services []JSONService `json:"services"` 23 + } `json:"pages"` 24 + } `json:"data"` 25 + } `json:"state"` 26 + } `json:"queries"` 27 + } `json:"liveTrainsState"` 28 + } `json:"pageProps"` 29 + } `json:"props"` 30 + } 31 + 32 + type JSONService struct { 33 + RID string `json:"rid"` 34 + TrainUID string `json:"trainUid"` 35 + Origin []struct { 36 + LocationName string `json:"locationName"` 37 + CRS string `json:"crs"` 38 + Via *string `json:"via"` 39 + } `json:"origin"` 40 + Destination []struct { 41 + LocationName string `json:"locationName"` 42 + CRS string `json:"crs"` 43 + Via *string `json:"via"` 44 + } `json:"destination"` 45 + JourneyDetails struct { 46 + From struct { 47 + LocationName string `json:"locationName"` 48 + CRS string `json:"crs"` 49 + } `json:"from"` 50 + To struct { 51 + LocationName string `json:"locationName"` 52 + CRS string `json:"crs"` 53 + } `json:"to"` 54 + Stops int `json:"stops"` 55 + DepartureInfo struct { 56 + Scheduled string `json:"scheduled"` 57 + Estimated *string `json:"estimated"` 58 + Actual *string `json:"actual"` 59 + } `json:"departureInfo"` 60 + ArrivalInfo struct { 61 + Scheduled string `json:"scheduled"` 62 + Estimated *string `json:"estimated"` 63 + Actual *string `json:"actual"` 64 + } `json:"arrivalInfo"` 65 + } `json:"journeyDetails"` 66 + Operator struct { 67 + Name string `json:"name"` 68 + Code string `json:"code"` 69 + } `json:"operator"` 70 + Status struct { 71 + Status string `json:"status"` 72 + DelayReason *string `json:"delayReason"` 73 + CancelReason *string `json:"cancelReason"` 74 + } 
`json:"status"` 75 + Platform string `json:"platform"` 76 + } 77 + 78 + // ParseDeparturesFromJSONHTML extracts departure information from HTML containing __NEXT_DATA__ JSON 79 + func ParseDeparturesFromJSONHTML(htmlContent string) ([]Departure, error) { 80 + doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent)) 81 + if err != nil { 82 + return nil, fmt.Errorf("failed to parse HTML: %w", err) 83 + } 84 + 85 + // Find the __NEXT_DATA__ script tag 86 + var jsonContent string 87 + doc.Find("script#__NEXT_DATA__").Each(func(i int, s *goquery.Selection) { 88 + rawContent := s.Text() 89 + jsonContent = cleanJSONString(rawContent) 90 + }) 91 + 92 + if jsonContent == "" { 93 + return nil, fmt.Errorf("no __NEXT_DATA__ script tag found") 94 + } 95 + 96 + // Parse the JSON 97 + var nextData NextData 98 + err = json.Unmarshal([]byte(jsonContent), &nextData) 99 + if err != nil { 100 + return nil, fmt.Errorf("failed to parse JSON: %w", err) 101 + } 102 + 103 + // Extract services from the nested structure 104 + var departures []Departure 105 + if len(nextData.Props.PageProps.LiveTrainsState.Queries) > 0 && 106 + len(nextData.Props.PageProps.LiveTrainsState.Queries[0].State.Data.Pages) > 0 { 107 + 108 + services := nextData.Props.PageProps.LiveTrainsState.Queries[0].State.Data.Pages[0].Services 109 + 110 + for _, service := range services { 111 + departure := convertJSONServiceToDeparture(service) 112 + departures = append(departures, departure) 113 + } 114 + } 115 + 116 + return departures, nil 117 + } 118 + 119 + func convertJSONServiceToDeparture(service JSONService) Departure { 120 + departure := Departure{ 121 + ServiceID: service.RID, 122 + Platform: service.Platform, 123 + Stops: service.JourneyDetails.Stops, 124 + Operator: service.Operator.Name, 125 + } 126 + 127 + // Extract destination 128 + if len(service.Destination) > 0 { 129 + departure.Destination = service.Destination[0].LocationName 130 + if service.Destination[0].Via != nil && 
*service.Destination[0].Via != "" { 131 + departure.Via = *service.Destination[0].Via 132 + } 133 + } 134 + 135 + // Parse scheduled time 136 + if scheduledTime, err := time.Parse(time.RFC3339, service.JourneyDetails.DepartureInfo.Scheduled); err == nil { 137 + departure.ScheduledTime = scheduledTime.Format("15:04") 138 + } 139 + 140 + // Parse estimated/actual time and set status 141 + if service.JourneyDetails.DepartureInfo.Actual != nil { 142 + // Train has already departed 143 + if actualTime, err := time.Parse(time.RFC3339, *service.JourneyDetails.DepartureInfo.Actual); err == nil { 144 + departure.ExpectedTime = actualTime.Format("15:04") 145 + departure.Status = "Departed " + departure.ExpectedTime 146 + } 147 + } else if service.JourneyDetails.DepartureInfo.Estimated != nil { 148 + // Train has estimated departure time 149 + if estimatedTime, err := time.Parse(time.RFC3339, *service.JourneyDetails.DepartureInfo.Estimated); err == nil { 150 + departure.ExpectedTime = estimatedTime.Format("15:04") 151 + if departure.ExpectedTime == departure.ScheduledTime { 152 + departure.Status = "On time" 153 + } else { 154 + departure.Status = "Expected " + departure.ExpectedTime 155 + } 156 + } 157 + } else { 158 + // Use service status 159 + switch service.Status.Status { 160 + case "OnTime": 161 + departure.Status = "On time" 162 + case "Late": 163 + departure.Status = "Late" 164 + case "Cancelled": 165 + departure.Status = "Cancelled" 166 + default: 167 + departure.Status = service.Status.Status 168 + } 169 + } 170 + 171 + // Calculate duration (rough estimate) 172 + if scheduledDep, err1 := time.Parse(time.RFC3339, service.JourneyDetails.DepartureInfo.Scheduled); err1 == nil { 173 + if scheduledArr, err2 := time.Parse(time.RFC3339, service.JourneyDetails.ArrivalInfo.Scheduled); err2 == nil { 174 + duration := scheduledArr.Sub(scheduledDep) 175 + departure.Duration = fmt.Sprintf("%d minutes", int(duration.Minutes())) 176 + } 177 + } 178 + 179 + // Add delay reason if 
available 180 + if service.Status.DelayReason != nil && *service.Status.DelayReason != "" { 181 + departure.DelayReason = *service.Status.DelayReason 182 + } 183 + 184 + return departure 185 + } 186 + 187 + // cleanJSONString fixes common JSON issues from HTML extraction 188 + func cleanJSONString(s string) string { 189 + s = strings.TrimSpace(s) 190 + 191 + // Fix newlines within string literals 192 + // This is a simplified approach - we replace newlines within quoted strings 193 + re := regexp.MustCompile(`"([^"]*\n[^"]*)"`) 194 + s = re.ReplaceAllStringFunc(s, func(match string) string { 195 + // Remove quotes, replace newlines with spaces, add quotes back 196 + inner := match[1 : len(match)-1] // Remove surrounding quotes 197 + inner = regexp.MustCompile(`\s*\n\s*`).ReplaceAllString(inner, " ") 198 + return `"` + inner + `"` 199 + }) 200 + 201 + return s 202 + }
+72
main.go
··· 1 + package main 2 + 3 + import ( 4 + "encoding/json" 5 + "fmt" 6 + "log" 7 + "os" 8 + ) 9 + 10 + func main() { 11 + if len(os.Args) < 2 { 12 + fmt.Println("Sundial - Live Train Departures") 13 + fmt.Println("Usage:") 14 + fmt.Println(" go run . server - Start web server") 15 + fmt.Println(" go run . cli - Start interactive CLI") 16 + fmt.Println(" go run . parse <html_file> - Parse HTML file") 17 + fmt.Println("Examples:") 18 + fmt.Println(" go run . server") 19 + fmt.Println(" go run . cli") 20 + fmt.Println(" go run . parse sample.html") 21 + os.Exit(1) 22 + } 23 + 24 + command := os.Args[1] 25 + 26 + switch command { 27 + case "server": 28 + runServer() 29 + case "cli": 30 + err := runCLI() 31 + if err != nil { 32 + log.Fatalf("CLI error: %v", err) 33 + } 34 + case "parse": 35 + if len(os.Args) < 3 { 36 + fmt.Println("Usage: go run . parse <html_file>") 37 + os.Exit(1) 38 + } 39 + parseFile(os.Args[2]) 40 + default: 41 + // Backward compatibility - treat first arg as filename 42 + parseFile(command) 43 + } 44 + } 45 + 46 + func runServer() { 47 + config, err := LoadConfig("config.yaml") 48 + if err != nil { 49 + log.Fatalf("Error loading config: %v", err) 50 + } 51 + 52 + server := NewServer(config) 53 + log.Fatal(server.Start()) 54 + } 55 + 56 + func parseFile(filename string) { 57 + departures, err := ParseDeparturesFromFile(filename) 58 + if err != nil { 59 + log.Fatalf("Error parsing departures: %v", err) 60 + } 61 + 62 + fmt.Printf("Found %d departures:\n\n", len(departures)) 63 + 64 + // Print as formatted JSON 65 + jsonData, err := json.MarshalIndent(departures, "", " ") 66 + if err != nil { 67 + log.Fatalf("Error marshaling to JSON: %v", err) 68 + } 69 + 70 + fmt.Println(string(jsonData)) 71 + } 72 +
+80
parser.go
··· 1 + package main 2 + 3 + import ( 4 + "fmt" 5 + "os" 6 + "strings" 7 + 8 + "github.com/PuerkitoBio/goquery" 9 + ) 10 + 11 + // ParseDeparturesFromHTML extracts departure information from the National Rail HTML 12 + func ParseDeparturesFromHTML(htmlContent string) ([]Departure, error) { 13 + // Try JSON approach first (for new React-based site) 14 + if departures, err := ParseDeparturesFromJSONHTML(htmlContent); err == nil && len(departures) > 0 { 15 + return departures, nil 16 + } 17 + 18 + // Fallback to old HTML parsing approach 19 + return parseFromAriaLabels(htmlContent) 20 + } 21 + 22 + func parseFromAriaLabels(htmlContent string) ([]Departure, error) { 23 + doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent)) 24 + if err != nil { 25 + return nil, fmt.Errorf("failed to parse HTML: %w", err) 26 + } 27 + 28 + seenServices := make(map[string]bool) 29 + var departures []Departure 30 + 31 + // Find all anchor tags with aria-label containing service information 32 + doc.Find("a[aria-label*='service for']").Each(func(i int, s *goquery.Selection) { 33 + ariaLabel, exists := s.Attr("aria-label") 34 + if !exists { 35 + return 36 + } 37 + 38 + // Extract service ID from href if available 39 + href, _ := s.Attr("href") 40 + serviceID := extractServiceID(href) 41 + 42 + // Skip if we've already processed this service 43 + if seenServices[serviceID] { 44 + return 45 + } 46 + seenServices[serviceID] = true 47 + 48 + departure, err := ParseAriaLabel(ariaLabel) 49 + if err != nil || departure == nil { 50 + return 51 + } 52 + 53 + departure.ServiceID = serviceID 54 + departures = append(departures, *departure) 55 + }) 56 + 57 + return departures, nil 58 + } 59 + 60 + // extractServiceID extracts the service ID from the href URL 61 + func extractServiceID(href string) string { 62 + if strings.Contains(href, "sid=") { 63 + parts := strings.Split(href, "sid=") 64 + if len(parts) > 1 { 65 + sidPart := strings.Split(parts[1], "&")[0] 66 + return sidPart 67 + 
} 68 + } 69 + return "" 70 + } 71 + 72 + // ParseDeparturesFromFile reads an HTML file and extracts departure information 73 + func ParseDeparturesFromFile(filename string) ([]Departure, error) { 74 + content, err := os.ReadFile(filename) 75 + if err != nil { 76 + return nil, fmt.Errorf("failed to read file %s: %w", filename, err) 77 + } 78 + 79 + return ParseDeparturesFromHTML(string(content)) 80 + }