dashboard of nationalrail train times

sundial: merge json and html scraping approach

Signed-off-by: oppiliappan <me@oppi.li>

oppi.li 89caa6c0 769d7446

verified
Changed files
+195 -203
-202
json_parser.go
··· 1 - package main 2 - 3 - import ( 4 - "encoding/json" 5 - "fmt" 6 - "regexp" 7 - "strings" 8 - "time" 9 - 10 - "github.com/PuerkitoBio/goquery" 11 - ) 12 - 13 - // NextData represents the structure of the __NEXT_DATA__ JSON 14 - type NextData struct { 15 - Props struct { 16 - PageProps struct { 17 - LiveTrainsState struct { 18 - Queries []struct { 19 - State struct { 20 - Data struct { 21 - Pages []struct { 22 - Services []JSONService `json:"services"` 23 - } `json:"pages"` 24 - } `json:"data"` 25 - } `json:"state"` 26 - } `json:"queries"` 27 - } `json:"liveTrainsState"` 28 - } `json:"pageProps"` 29 - } `json:"props"` 30 - } 31 - 32 - type JSONService struct { 33 - RID string `json:"rid"` 34 - TrainUID string `json:"trainUid"` 35 - Origin []struct { 36 - LocationName string `json:"locationName"` 37 - CRS string `json:"crs"` 38 - Via *string `json:"via"` 39 - } `json:"origin"` 40 - Destination []struct { 41 - LocationName string `json:"locationName"` 42 - CRS string `json:"crs"` 43 - Via *string `json:"via"` 44 - } `json:"destination"` 45 - JourneyDetails struct { 46 - From struct { 47 - LocationName string `json:"locationName"` 48 - CRS string `json:"crs"` 49 - } `json:"from"` 50 - To struct { 51 - LocationName string `json:"locationName"` 52 - CRS string `json:"crs"` 53 - } `json:"to"` 54 - Stops int `json:"stops"` 55 - DepartureInfo struct { 56 - Scheduled string `json:"scheduled"` 57 - Estimated *string `json:"estimated"` 58 - Actual *string `json:"actual"` 59 - } `json:"departureInfo"` 60 - ArrivalInfo struct { 61 - Scheduled string `json:"scheduled"` 62 - Estimated *string `json:"estimated"` 63 - Actual *string `json:"actual"` 64 - } `json:"arrivalInfo"` 65 - } `json:"journeyDetails"` 66 - Operator struct { 67 - Name string `json:"name"` 68 - Code string `json:"code"` 69 - } `json:"operator"` 70 - Status struct { 71 - Status string `json:"status"` 72 - DelayReason *string `json:"delayReason"` 73 - CancelReason *string `json:"cancelReason"` 74 - } `json:"status"` 75 - Platform string `json:"platform"` 76 - } 77 - 78 - // ParseDeparturesFromJSONHTML extracts departure information from HTML containing __NEXT_DATA__ JSON 79 - func ParseDeparturesFromJSONHTML(htmlContent string) ([]Departure, error) { 80 - doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent)) 81 - if err != nil { 82 - return nil, fmt.Errorf("failed to parse HTML: %w", err) 83 - } 84 - 85 - // Find the __NEXT_DATA__ script tag 86 - var jsonContent string 87 - doc.Find("script#__NEXT_DATA__").Each(func(i int, s *goquery.Selection) { 88 - rawContent := s.Text() 89 - jsonContent = cleanJSONString(rawContent) 90 - }) 91 - 92 - if jsonContent == "" { 93 - return nil, fmt.Errorf("no __NEXT_DATA__ script tag found") 94 - } 95 - 96 - // Parse the JSON 97 - var nextData NextData 98 - err = json.Unmarshal([]byte(jsonContent), &nextData) 99 - if err != nil { 100 - return nil, fmt.Errorf("failed to parse JSON: %w", err) 101 - } 102 - 103 - // Extract services from the nested structure 104 - var departures []Departure 105 - if len(nextData.Props.PageProps.LiveTrainsState.Queries) > 0 && 106 - len(nextData.Props.PageProps.LiveTrainsState.Queries[0].State.Data.Pages) > 0 { 107 - 108 - services := nextData.Props.PageProps.LiveTrainsState.Queries[0].State.Data.Pages[0].Services 109 - 110 - for _, service := range services { 111 - departure := convertJSONServiceToDeparture(service) 112 - departures = append(departures, departure) 113 - } 114 - } 115 - 116 - return departures, nil 117 - } 118 - 119 - func convertJSONServiceToDeparture(service JSONService) Departure { 120 - departure := Departure{ 121 - ServiceID: service.RID, 122 - Platform: service.Platform, 123 - Stops: service.JourneyDetails.Stops, 124 - Operator: service.Operator.Name, 125 - } 126 - 127 - // Extract destination 128 - if len(service.Destination) > 0 { 129 - departure.Destination = service.Destination[0].LocationName 130 - if service.Destination[0].Via != nil && *service.Destination[0].Via != "" { 131 - departure.Via = *service.Destination[0].Via 132 - } 133 - } 134 - 135 - // Parse scheduled time 136 - if scheduledTime, err := time.Parse(time.RFC3339, service.JourneyDetails.DepartureInfo.Scheduled); err == nil { 137 - departure.ScheduledTime = scheduledTime.Format("15:04") 138 - } 139 - 140 - // Parse estimated/actual time and set status 141 - if service.JourneyDetails.DepartureInfo.Actual != nil { 142 - // Train has already departed 143 - if actualTime, err := time.Parse(time.RFC3339, *service.JourneyDetails.DepartureInfo.Actual); err == nil { 144 - departure.ExpectedTime = actualTime.Format("15:04") 145 - departure.Status = "Departed " + departure.ExpectedTime 146 - } 147 - } else if service.JourneyDetails.DepartureInfo.Estimated != nil { 148 - // Train has estimated departure time 149 - if estimatedTime, err := time.Parse(time.RFC3339, *service.JourneyDetails.DepartureInfo.Estimated); err == nil { 150 - departure.ExpectedTime = estimatedTime.Format("15:04") 151 - if departure.ExpectedTime == departure.ScheduledTime { 152 - departure.Status = "On time" 153 - } else { 154 - departure.Status = "Expected " + departure.ExpectedTime 155 - } 156 - } 157 - } else { 158 - // Use service status 159 - switch service.Status.Status { 160 - case "OnTime": 161 - departure.Status = "On time" 162 - case "Late": 163 - departure.Status = "Late" 164 - case "Cancelled": 165 - departure.Status = "Cancelled" 166 - default: 167 - departure.Status = service.Status.Status 168 - } 169 - } 170 - 171 - // Calculate duration (rough estimate) 172 - if scheduledDep, err1 := time.Parse(time.RFC3339, service.JourneyDetails.DepartureInfo.Scheduled); err1 == nil { 173 - if scheduledArr, err2 := time.Parse(time.RFC3339, service.JourneyDetails.ArrivalInfo.Scheduled); err2 == nil { 174 - duration := scheduledArr.Sub(scheduledDep) 175 - departure.Duration = fmt.Sprintf("%d minutes", int(duration.Minutes())) 176 - } 177 - } 178 - 179 - // Add delay reason if available 180 - if service.Status.DelayReason != nil && *service.Status.DelayReason != "" { 181 - departure.DelayReason = *service.Status.DelayReason 182 - } 183 - 184 - return departure 185 - } 186 - 187 - // cleanJSONString fixes common JSON issues from HTML extraction 188 - func cleanJSONString(s string) string { 189 - s = strings.TrimSpace(s) 190 - 191 - // Fix newlines within string literals 192 - // This is a simplified approach - we replace newlines within quoted strings 193 - re := regexp.MustCompile(`"([^"]*\n[^"]*)"`) 194 - s = re.ReplaceAllStringFunc(s, func(match string) string { 195 - // Remove quotes, replace newlines with spaces, add quotes back 196 - inner := match[1 : len(match)-1] // Remove surrounding quotes 197 - inner = regexp.MustCompile(`\s*\n\s*`).ReplaceAllString(inner, " ") 198 - return `"` + inner + `"` 199 - }) 200 - 201 - return s 202 - }
+195 -1
parser.go
··· 1 1 package main 2 2 3 3 import ( 4 + "encoding/json" 4 5 "fmt" 5 6 "os" 7 + "regexp" 6 8 "strings" 9 + "time" 7 10 8 11 "github.com/PuerkitoBio/goquery" 9 12 ) ··· 77 80 } 78 81 79 82 return ParseDeparturesFromHTML(string(content)) 80 - } 83 + } 84 + 85 + // NextData represents the structure of the __NEXT_DATA__ JSON 86 + type NextData struct { 87 + Props struct { 88 + PageProps struct { 89 + LiveTrainsState struct { 90 + Queries []struct { 91 + State struct { 92 + Data struct { 93 + Pages []struct { 94 + Services []JSONService `json:"services"` 95 + } `json:"pages"` 96 + } `json:"data"` 97 + } `json:"state"` 98 + } `json:"queries"` 99 + } `json:"liveTrainsState"` 100 + } `json:"pageProps"` 101 + } `json:"props"` 102 + } 103 + 104 + type JSONService struct { 105 + RID string `json:"rid"` 106 + TrainUID string `json:"trainUid"` 107 + Origin []struct { 108 + LocationName string `json:"locationName"` 109 + CRS string `json:"crs"` 110 + Via *string `json:"via"` 111 + } `json:"origin"` 112 + Destination []struct { 113 + LocationName string `json:"locationName"` 114 + CRS string `json:"crs"` 115 + Via *string `json:"via"` 116 + } `json:"destination"` 117 + JourneyDetails struct { 118 + From struct { 119 + LocationName string `json:"locationName"` 120 + CRS string `json:"crs"` 121 + } `json:"from"` 122 + To struct { 123 + LocationName string `json:"locationName"` 124 + CRS string `json:"crs"` 125 + } `json:"to"` 126 + Stops int `json:"stops"` 127 + DepartureInfo struct { 128 + Scheduled string `json:"scheduled"` 129 + Estimated *string `json:"estimated"` 130 + Actual *string `json:"actual"` 131 + } `json:"departureInfo"` 132 + ArrivalInfo struct { 133 + Scheduled string `json:"scheduled"` 134 + Estimated *string `json:"estimated"` 135 + Actual *string `json:"actual"` 136 + } `json:"arrivalInfo"` 137 + } `json:"journeyDetails"` 138 + Operator struct { 139 + Name string `json:"name"` 140 + Code string `json:"code"` 141 + } `json:"operator"` 142 + Status struct { 143 + Status string `json:"status"` 144 + DelayReason *string `json:"delayReason"` 145 + CancelReason *string `json:"cancelReason"` 146 + } `json:"status"` 147 + Platform string `json:"platform"` 148 + } 149 + 150 + // ParseDeparturesFromJSONHTML extracts departure information from HTML containing __NEXT_DATA__ JSON 151 + func ParseDeparturesFromJSONHTML(htmlContent string) ([]Departure, error) { 152 + doc, err := goquery.NewDocumentFromReader(strings.NewReader(htmlContent)) 153 + if err != nil { 154 + return nil, fmt.Errorf("failed to parse HTML: %w", err) 155 + } 156 + 157 + // Find the __NEXT_DATA__ script tag 158 + var jsonContent string 159 + doc.Find("script#__NEXT_DATA__").Each(func(i int, s *goquery.Selection) { 160 + rawContent := s.Text() 161 + jsonContent = cleanJSONString(rawContent) 162 + }) 163 + 164 + if jsonContent == "" { 165 + return nil, fmt.Errorf("no __NEXT_DATA__ script tag found") 166 + } 167 + 168 + // Parse the JSON 169 + var nextData NextData 170 + err = json.Unmarshal([]byte(jsonContent), &nextData) 171 + if err != nil { 172 + return nil, fmt.Errorf("failed to parse JSON: %w", err) 173 + } 174 + 175 + // Extract services from the nested structure 176 + var departures []Departure 177 + if len(nextData.Props.PageProps.LiveTrainsState.Queries) > 0 && 178 + len(nextData.Props.PageProps.LiveTrainsState.Queries[0].State.Data.Pages) > 0 { 179 + 180 + services := nextData.Props.PageProps.LiveTrainsState.Queries[0].State.Data.Pages[0].Services 181 + 182 + for _, service := range services { 183 + departure := convertJSONServiceToDeparture(service) 184 + departures = append(departures, departure) 185 + } 186 + } 187 + 188 + return departures, nil 189 + } 190 + 191 + func convertJSONServiceToDeparture(service JSONService) Departure { 192 + departure := Departure{ 193 + ServiceID: service.RID, 194 + Platform: service.Platform, 195 + Stops: service.JourneyDetails.Stops, 196 + Operator: service.Operator.Name, 197 + } 198 + 199 + // Extract destination 200 + if len(service.Destination) > 0 { 201 + departure.Destination = service.Destination[0].LocationName 202 + if service.Destination[0].Via != nil && *service.Destination[0].Via != "" { 203 + departure.Via = *service.Destination[0].Via 204 + } 205 + } 206 + 207 + // Parse scheduled time 208 + if scheduledTime, err := time.Parse(time.RFC3339, service.JourneyDetails.DepartureInfo.Scheduled); err == nil { 209 + departure.ScheduledTime = scheduledTime.Format("15:04") 210 + } 211 + 212 + // Parse estimated/actual time and set status 213 + if service.JourneyDetails.DepartureInfo.Actual != nil { 214 + // Train has already departed 215 + if actualTime, err := time.Parse(time.RFC3339, *service.JourneyDetails.DepartureInfo.Actual); err == nil { 216 + departure.ExpectedTime = actualTime.Format("15:04") 217 + departure.Status = "Departed " + departure.ExpectedTime 218 + } 219 + } else if service.JourneyDetails.DepartureInfo.Estimated != nil { 220 + // Train has estimated departure time 221 + if estimatedTime, err := time.Parse(time.RFC3339, *service.JourneyDetails.DepartureInfo.Estimated); err == nil { 222 + departure.ExpectedTime = estimatedTime.Format("15:04") 223 + if departure.ExpectedTime == departure.ScheduledTime { 224 + departure.Status = "On time" 225 + } else { 226 + departure.Status = "Expected " + departure.ExpectedTime 227 + } 228 + } 229 + } else { 230 + // Use service status 231 + switch service.Status.Status { 232 + case "OnTime": 233 + departure.Status = "On time" 234 + case "Late": 235 + departure.Status = "Late" 236 + case "Cancelled": 237 + departure.Status = "Cancelled" 238 + default: 239 + departure.Status = service.Status.Status 240 + } 241 + } 242 + 243 + // Calculate duration (rough estimate) 244 + if scheduledDep, err1 := time.Parse(time.RFC3339, service.JourneyDetails.DepartureInfo.Scheduled); err1 == nil { 245 + if scheduledArr, err2 := time.Parse(time.RFC3339, service.JourneyDetails.ArrivalInfo.Scheduled); err2 == nil { 246 + duration := scheduledArr.Sub(scheduledDep) 247 + departure.Duration = fmt.Sprintf("%d minutes", int(duration.Minutes())) 248 + } 249 + } 250 + 251 + // Add delay reason if available 252 + if service.Status.DelayReason != nil && *service.Status.DelayReason != "" { 253 + departure.DelayReason = *service.Status.DelayReason 254 + } 255 + 256 + return departure 257 + } 258 + 259 + // cleanJSONString fixes common JSON issues from HTML extraction 260 + func cleanJSONString(s string) string { 261 + s = strings.TrimSpace(s) 262 + 263 + // Fix newlines within string literals 264 + // This is a simplified approach - we replace newlines within quoted strings 265 + re := regexp.MustCompile(`"([^"]*\n[^"]*)"`) 266 + s = re.ReplaceAllStringFunc(s, func(match string) string { 267 + // Remove quotes, replace newlines with spaces, add quotes back 268 + inner := match[1 : len(match)-1] // Remove surrounding quotes 269 + inner = regexp.MustCompile(`\s*\n\s*`).ReplaceAllString(inner, " ") 270 + return `"` + inner + `"` 271 + }) 272 + 273 + return s 274 + }