beebo
2
fork

Configure Feed

Select the types of activity you want to include in your feed.

archive.is -> archive.org

j3s.sh 8578094d 9d6fb2a0

verified
+136 -327
-206
archiveis/capture.go
··· 1 - package archiveis 2 - 3 - import ( 4 - "bytes" 5 - "errors" 6 - "fmt" 7 - "io" 8 - "log" 9 - "net/http" 10 - "net/url" 11 - "regexp" 12 - "strings" 13 - "time" 14 - 15 - "github.com/PuerkitoBio/goquery" 16 - ) 17 - 18 - var ( 19 - BaseURL = "https://archive.is" // Overrideable default package value. 20 - HTTPHost = "archive.is" // Overrideable default package value. 21 - UserAgent = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.162 Safari/537.36" // Overrideable default package value. 22 - DefaultRequestTimeout = 10 * time.Second // Overrideable default package value. 23 - DefaultPollInterval = 5 * time.Second // Overrideable default package value. 24 - 25 - jsLocationExpr = regexp.MustCompile(`document\.location\.replace\(["']([^"']+)`) 26 - ) 27 - 28 - // Config settings for page capture client behavior. 29 - type Config struct { 30 - Anyway bool // Force archival even if there is already a recent snapshot of the page. 31 - Wait bool // Wait until the crawl has been completed. 32 - WaitTimeout time.Duration // Max time to wait for crawl completion. Default is unlimited. 33 - PollInterval time.Duration // Interval between crawl completion checks. Defaults to 5s. 34 - RequestTimeout time.Duration // Overrides default request timeout. 35 - SubmitID string // Accepts a user-provided submitid. 36 - } 37 - 38 - // Capture archives the provided URL using the archive.is service. 
39 - func Capture(u string, cfg ...Config) (string, error) { 40 - timeout := DefaultRequestTimeout 41 - if len(cfg) > 0 && cfg[0].RequestTimeout > time.Duration(0) { 42 - timeout = cfg[0].RequestTimeout 43 - } 44 - 45 - var ( 46 - submitID string 47 - anyway string 48 - body []byte 49 - resp *http.Response 50 - final string 51 - err error 52 - ) 53 - 54 - if len(cfg) > 0 && len(cfg[0].SubmitID) > 0 { 55 - submitID = cfg[0].SubmitID 56 - log.Printf("Will use caller-provided submitid=%v", submitID) 57 - } else if submitID, err = newSubmitID(timeout); err != nil { 58 - return "", err 59 - } 60 - 61 - if len(cfg) > 0 && cfg[0].Anyway { 62 - anyway = "&anyway=1" 63 - } 64 - 65 - content := fmt.Sprintf("submitid=%v&url=%v%v", url.QueryEscape(submitID), url.QueryEscape(u), anyway) 66 - 67 - resp, body, err = doRequest("POST", BaseURL+"/submit/", io.NopCloser(bytes.NewBufferString(content)), timeout) 68 - if err != nil { 69 - return "", err 70 - } 71 - 72 - if resp.StatusCode/100 == 3 { 73 - // Page has already been archived. 
74 - log.Print("Detected redirect to archived page") 75 - 76 - if loc := resp.Header.Get("Location"); len(loc) == 0 { 77 - return "", fmt.Errorf("received a redirect status-code %v with an empty Location header", resp.StatusCode) 78 - } else { 79 - final = loc 80 - } 81 - } else { 82 - // log.Printf("body: %+v\n", string(body)) 83 - // log.Printf("headers: %+v\n", resp.Header) 84 - // log.Printf("trailers: %+v\n", resp.Trailer) 85 - 86 - doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(body)) 87 - if err != nil { 88 - return "", fmt.Errorf("constructing goquery doc from submission response: %s", err) 89 - } 90 - 91 - if script := doc.Find("script").First(); script != nil { 92 - js := strings.Trim(script.Text(), "\r\n\t ") 93 - if match := jsLocationExpr.FindStringSubmatch(js); len(match) > 1 { 94 - final = match[1] 95 - } 96 - } 97 - 98 - if len(final) == 0 { 99 - input := doc.Find("input[name=id]").First() 100 - if input == nil { 101 - return "", errors.New("page archive ID not found in submission response content") 102 - } 103 - id, exists := input.Attr("value") 104 - if !exists { 105 - log.Printf("No page archive ID value detected, here was the page content: %v", string(body)) 106 - return "", errors.New("no page archive ID value available") 107 - } 108 - 109 - final = fmt.Sprintf("%v/%v", BaseURL, id) 110 - } 111 - } 112 - 113 - log.Printf("Capture for url=%v -> %v", u, final) 114 - 115 - if len(cfg) > 0 && cfg[0].Wait { 116 - var ( 117 - waitTimeout = cfg[0].WaitTimeout 118 - pollInterval = DefaultPollInterval 119 - ) 120 - 121 - if cfg[0].PollInterval > time.Duration(0) { 122 - pollInterval = cfg[0].PollInterval 123 - } 124 - 125 - if err := waitForCrawlToFinish(final, body, timeout, waitTimeout, pollInterval); err != nil { 126 - return final, err 127 - } 128 - } 129 - 130 - return final, nil 131 - } 132 - 133 - // newSubmitID gets the index page and extracts the form submission identifier. 
134 - func newSubmitID(timeout time.Duration) (string, error) { 135 - _, body, err := doRequest("", BaseURL, nil, timeout) 136 - if err != nil { 137 - return "", err 138 - } 139 - 140 - doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(body)) 141 - if err != nil { 142 - return "", fmt.Errorf("constructing goquery doc from index: %s", err) 143 - } 144 - 145 - input := doc.Find("input[name=submitid]").First() 146 - if input == nil { 147 - return "", errors.New("no submitid element found") 148 - } 149 - id, exists := input.Attr("value") 150 - if !exists { 151 - return "", errors.New("no submitid value available") 152 - } 153 - return id, nil 154 - } 155 - 156 - func waitForCrawlToFinish(url string, body []byte, requestTimeout time.Duration, waitTimeout time.Duration, pollInterval time.Duration) error { 157 - var ( 158 - expr = regexp.MustCompile(`<html><body>`) 159 - until = time.Now().Add(waitTimeout) 160 - d = time.Now().Sub(until) 161 - err error 162 - ) 163 - 164 - if body != nil && !expr.Match(body) { 165 - // log.WithField("url", url).WithField("wait-timeout", waitTimeout).WithField("poll-interval", pollInterval).Printf("Detected crawl completion after %s", d) 166 - if err := checkCrawlResult(body); err != nil { 167 - return err 168 - } 169 - return nil 170 - } 171 - 172 - // log.WithField("url", url).WithField("wait-timeout", waitTimeout).WithField("poll-interval", pollInterval).Debug("Waiting for crawl to finish") 173 - for { 174 - if waitTimeout != time.Duration(0) && time.Now().After(until) { 175 - return fmt.Errorf("timed out after %s waiting for crawl to complete", waitTimeout) 176 - } 177 - 178 - _, body, err = doRequest("", url, nil, requestTimeout) 179 - 180 - d = time.Now().Sub(until) 181 - 182 - if err != nil { 183 - log.Printf("Non-fatal error while polling for crawl completion: %s (continuing on, waiting for %s so far)", err, d) 184 - } else if !expr.Match(body) { 185 - // log.WithField("url", url).WithField("wait-timeout", 
waitTimeout).WithField("poll-interval", pollInterval).Printf("Detected crawl completion after %s", d) 186 - break 187 - } 188 - 189 - time.Sleep(pollInterval) 190 - } 191 - return nil 192 - } 193 - 194 - // checkCrawlResult searches for known archive.is errors in HTML content. 195 - func checkCrawlResult(body []byte) error { 196 - doc, err := goquery.NewDocumentFromReader(bytes.NewBuffer(body)) 197 - if err != nil { 198 - return fmt.Errorf("crawl result check gq new doc: %s", err) 199 - } 200 - if block := doc.Find("html > body > div").First(); block != nil { 201 - if text := strings.Trim(block.Text(), "\r\n\t "); text == "Error: Network error." { 202 - return fmt.Errorf("archive.is crawl result: Network Error") 203 - } 204 - } 205 - return nil 206 - }
-74
archiveis/http.go
··· 1 - package archiveis 2 - 3 - import ( 4 - "fmt" 5 - "io" 6 - "net" 7 - "net/http" 8 - "strings" 9 - "time" 10 - ) 11 - 12 - func doRequest(method string, url string, body io.ReadCloser, timeout time.Duration) (*http.Response, []byte, error) { 13 - req, err := newRequest(method, url, body) 14 - if err != nil { 15 - return nil, nil, err 16 - } 17 - 18 - if method != "" && method != "get" { 19 - req.Header.Set("content-type", "application/x-www-form-urlencoded") 20 - } 21 - 22 - client := newClient(timeout) 23 - resp, err := client.Do(req) 24 - if err != nil { 25 - return resp, nil, fmt.Errorf("executing request: %s", err) 26 - } 27 - if resp.StatusCode/100 != 2 && resp.StatusCode/100 != 3 { 28 - return resp, nil, fmt.Errorf("%v request to %v received unhappy response status-code=%v", method, url, resp.StatusCode) 29 - } 30 - respBody, err := io.ReadAll(resp.Body) 31 - if err != nil { 32 - return resp, nil, fmt.Errorf("reading response body: %s", err) 33 - } 34 - if err := resp.Body.Close(); err != nil { 35 - return resp, respBody, fmt.Errorf("closing response body: %s", err) 36 - } 37 - return resp, respBody, nil 38 - } 39 - 40 - func newRequest(method string, url string, body io.ReadCloser) (*http.Request, error) { 41 - req, err := http.NewRequest(method, url, body) 42 - if err != nil { 43 - return nil, fmt.Errorf("creating %v request to %v: %s", method, url, err) 44 - } 45 - 46 - req.Host = HTTPHost 47 - 48 - hostname := strings.Split(BaseURL, "://")[1] 49 - req.Header.Set("Host", hostname) 50 - req.Header.Set("Origin", hostname) 51 - req.Header.Set("Authority", hostname) 52 - req.Header.Set("User-Agent", UserAgent) 53 - req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8") 54 - req.Header.Set("Referer", BaseURL+"/") 55 - 56 - return req, nil 57 - } 58 - 59 - func newClient(timeout time.Duration) *http.Client { 60 - c := &http.Client{ 61 - Timeout: timeout, 62 - Transport: &http.Transport{ 63 - Proxy: 
http.ProxyFromEnvironment, 64 - Dial: (&net.Dialer{ 65 - Timeout: timeout, 66 - KeepAlive: timeout, 67 - }).Dial, 68 - TLSHandshakeTimeout: timeout, 69 - ResponseHeaderTimeout: timeout, 70 - ExpectContinueTimeout: 1 * time.Second, 71 - }, 72 - } 73 - return c 74 - }
-1
archiveis/readme
··· 1 - forked from https://github.com/jaytaylor/archive.is
+1 -1
files/settings.tmpl.html
··· 32 32 33 33 https://100r.co/links/rss.xml 34 34 https://begriffs.com/atom.xml 35 - https://blog.passtheballsocrates.com/feed/ 35 + https://blog.stillgreenmoss.net/feed/ 36 36 https://facklambda.dev/atom.xml 37 37 https://herman.bearblog.dev/feed/ 38 38 https://j3s.sh/feed.atom
-3
go.mod
··· 3 3 go 1.22 4 4 5 5 require ( 6 - github.com/PuerkitoBio/goquery v1.9.1 7 6 github.com/axgle/mahonia v0.0.0-20180208002826-3358181d7394 8 7 github.com/glebarez/go-sqlite v1.21.2 9 8 golang.org/x/crypto v0.19.0 10 9 ) 11 10 12 11 require ( 13 - github.com/andybalholm/cascadia v1.3.2 // indirect 14 12 github.com/dustin/go-humanize v1.0.1 // indirect 15 13 github.com/google/uuid v1.3.0 // indirect 16 14 github.com/mattn/go-isatty v0.0.19 // indirect 17 15 github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec // indirect 18 - golang.org/x/net v0.21.0 // indirect 19 16 golang.org/x/sys v0.17.0 // indirect 20 17 modernc.org/libc v1.24.1 // indirect 21 18 modernc.org/mathutil v1.6.0 // indirect
-40
go.sum
··· 1 - github.com/PuerkitoBio/goquery v1.9.1 h1:mTL6XjbJTZdpfL+Gwl5U2h1l9yEkJjhmlTeV9VPW7UI= 2 - github.com/PuerkitoBio/goquery v1.9.1/go.mod h1:cW1n6TmIMDoORQU5IU/P1T3tGFunOeXEpGP2WHRwkbY= 3 - github.com/andybalholm/cascadia v1.3.2 h1:3Xi6Dw5lHF15JtdcmAHD3i1+T8plmv7BQ/nsViSLyss= 4 - github.com/andybalholm/cascadia v1.3.2/go.mod h1:7gtRlve5FxPPgIgX36uWBX58OdBsSS6lUvCFb+h7KvU= 5 1 github.com/axgle/mahonia v0.0.0-20180208002826-3358181d7394 h1:OYA+5W64v3OgClL+IrOD63t4i/RW7RqrAVl9LTZ9UqQ= 6 2 github.com/axgle/mahonia v0.0.0-20180208002826-3358181d7394/go.mod h1:Q8n74mJTIgjX4RBBcHnJ05h//6/k6foqmgE45jTQtxg= 7 3 github.com/dustin/go-humanize v1.0.1 h1:GzkhY7T5VNhEkwH0PVJgjz+fX1rhBrR7pRT3mDkpeCY= ··· 16 12 github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= 17 13 github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec h1:W09IVJc94icq4NjY3clb7Lk8O1qJ8BdBEF8z0ibU0rE= 18 14 github.com/remyoudompheng/bigfft v0.0.0-20230129092748-24d4a6f8daec/go.mod h1:qqbHyh8v60DhA7CoWK5oRCqLrMHRGoxYCSS9EjAz6Eo= 19 - github.com/yuin/goldmark v1.4.13/go.mod h1:6yULJ656Px+3vBD8DxQVa3kxgyrAnzto9xy5taEt/CY= 20 - golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= 21 - golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= 22 15 golang.org/x/crypto v0.19.0 h1:ENy+Az/9Y1vSrlrvBSyna3PITt4tiZLf7sgCjZBX7Wo= 23 16 golang.org/x/crypto v0.19.0/go.mod h1:Iy9bg/ha4yyC70EfRS8jz+B6ybOBKMaSxLj6P6oBDfU= 24 - golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91VN4djpZkiMVwK6gcyfeH4XE8wZrZaV4= 25 - golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= 26 - golang.org/x/net v0.0.0-20190620200207-3b0461eec859/go.mod h1:z5CRVTTTmAJ677TzLLGU+0bjPO0LkuOLi4/5GtJWs/s= 27 - golang.org/x/net v0.0.0-20210226172049-e18ecbb05110/go.mod h1:m0MpNAwzfU5UDzcl9v0D8zg8gWTRqZa9RBIspLL5mdg= 28 - golang.org/x/net 
v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= 29 - golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= 30 - golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= 31 - golang.org/x/net v0.21.0 h1:AQyQV4dYCvJ7vGmJyKki9+PBdyvhkSd8EIx/qb0AYv4= 32 - golang.org/x/net v0.21.0/go.mod h1:bIjVDfnllIU7BJ2DNgfnXvpSvtn8VRwhlsaeUTyUS44= 33 - golang.org/x/sync v0.0.0-20190423024810-112230192c58/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 34 - golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 35 - golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= 36 - golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= 37 - golang.org/x/sys v0.0.0-20201119102817-f84b799fce68/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= 38 - golang.org/x/sys v0.0.0-20210615035016-665e8c7367d1/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 39 - golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 40 - golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 41 - golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 42 17 golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 43 - golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= 44 18 golang.org/x/sys v0.17.0 h1:25cE3gD+tdBA7lp7QfhuV+rJiE9YXTcS3VG1SqssI/Y= 45 19 golang.org/x/sys v0.17.0/go.mod h1:/VUhepiaJMQUp4+oa/7Zr1D23ma6VTLIYjOOTFZPUcA= 46 - golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= 47 - golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= 48 - golang.org/x/term v0.5.0/go.mod 
h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= 49 - golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= 50 - golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= 51 - golang.org/x/text v0.3.3/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= 52 - golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= 53 - golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= 54 - golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= 55 - golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= 56 - golang.org/x/tools v0.0.0-20191119224855-298f0cb1881e/go.mod h1:b+2E5dAYhXwXZwtnZ6UAqBI28+e2cm9otk0dWdXHAEo= 57 - golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc= 58 - golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= 59 - golang.org/x/xerrors v0.0.0-20190717185122-a985d3407aa7/go.mod h1:I/5z698sn9Ka8TeJc9MKroUUfqBBauWjQqLJ2OPfmY0= 60 20 modernc.org/libc v1.24.1 h1:uvJSeCKL/AgzBo2yYIPPTy82v21KgGnizcGYfBHaNuM= 61 21 modernc.org/libc v1.24.1/go.mod h1:FmfO1RLrU3MHJfyi9eYYmZBfi/R+tqZ6+hQ3yQQUkak= 62 22 modernc.org/mathutil v1.6.0 h1:fRe9+AmYlaej+64JsEEhoWuAYBkOtQiMEU7n/XgfYi4=
+6 -2
site.go
··· 1 1 package main 2 2 3 3 import ( 4 + "context" 4 5 "errors" 5 6 "fmt" 6 7 "html/template" ··· 13 14 "strings" 14 15 "time" 15 16 16 - "git.j3s.sh/vore/archiveis" 17 17 "git.j3s.sh/vore/lib" 18 18 "git.j3s.sh/vore/reaper" 19 19 "git.j3s.sh/vore/rss" 20 20 "git.j3s.sh/vore/sqlite" 21 + "git.j3s.sh/vore/wayback" 21 22 "golang.org/x/crypto/bcrypt" 22 23 ) 23 24 ··· 138 139 fmt.Fprintf(w, "error!") 139 140 return 140 141 } 141 - archiveURL, err := archiveis.Capture(decodedURL) 142 + 143 + c := wayback.Client{} 144 + 145 + archiveURL, err := c.Archive(context.Background(), decodedURL) 142 146 if err != nil { 143 147 log.Println(err) 144 148 fmt.Fprintf(w, "error capturing archive!!")
+1
wayback/readme
··· 1 + forked from https://github.com/wabarc/archive.org/blob/main/ia.go
+128
wayback/wayback.go
import (
	"context"
	"encoding/json"
	"errors"
	"fmt"
	"io"
	"net/http"
	"regexp"
	"time"
)

// Client saves webpages to the Internet Archive's Wayback Machine
// (web.archive.org). The zero value is usable; a default HTTP client
// is created lazily on the first Archive call.
type Client struct {
	httpClient *http.Client
}

// userAgent is sent with every request so archive.org sees a browser-like client.
const userAgent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/83.0.4103.97 Safari/537.36"

var (
	host = "archive.org"
	dest = "https://web." + host
	base = "https://web.archive.org/save/"

	// endpoint is the Wayback availability API, used to look up the
	// most recent existing snapshot when a fresh capture yields no URL.
	endpoint = "https://archive.org/wayback/available"

	// snapshotRe extracts web.archive.org snapshot URLs from response
	// headers. Compiled once at package scope, not per request.
	snapshotRe = regexp.MustCompile(`(?m)http[s]?:\/\/web\.archive\.org/web/[-a-zA-Z0-9@:%_\+.~#?&//=]*`)
)

// Archive submits u to the Wayback Machine and returns the URL of the
// resulting snapshot (or a best-effort fallback URL). It is the public
// entry point wrapping archive.
func (wbrc *Client) Archive(ctx context.Context, u string) (string, error) {
	if wbrc.httpClient == nil {
		// Redirects are suppressed because the snapshot location is
		// read from the headers of the first response. A timeout keeps
		// a stuck save request from hanging the caller forever.
		wbrc.httpClient = &http.Client{
			CheckRedirect: noRedirect,
			Timeout:       2 * time.Minute,
		}
	}
	return wbrc.archive(ctx, u)
}

// archive requests a fresh capture of uri and extracts the snapshot
// URL from whichever header archive.org used to report it.
func (wbrc *Client) archive(ctx context.Context, uri string) (string, error) {
	req, err := http.NewRequestWithContext(ctx, http.MethodGet, base+uri, nil)
	if err != nil {
		return "", err
	}
	req.Header.Add("User-Agent", userAgent)

	resp, err := wbrc.httpClient.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// archive.org reports the snapshot location in different headers
	// depending on response type; check them in order of preference.
	if loc := resp.Header.Get("Content-Location"); len(loc) > 0 {
		return loc, nil
	}
	if loc := resp.Header.Get("Location"); len(loc) > 0 {
		return loc, nil
	}
	if match := snapshotRe.FindAllString(resp.Header.Get("Link"), -1); len(match) > 0 {
		// The last matching link is the newest snapshot.
		return match[len(match)-1], nil
	}
	if loc := resp.Request.URL.String(); snapshotRe.MatchString(loc) {
		return loc, nil
	}

	// No snapshot URL in the response (e.g. HTTP 509 bandwidth limit
	// or a non-200 status): fall back to the latest known snapshot,
	// or to the save URL itself if that lookup fails. This is always
	// returned without error, matching the original best-effort intent.
	loc, err := wbrc.latest(ctx, uri)
	if err != nil {
		loc = base + uri
	}
	return loc, nil
}

// latest queries the availability API for the most recent archived
// snapshot of u. If no usable snapshot is found it returns the generic
// "all snapshots" listing URL together with a non-nil error.
func (wbrc *Client) latest(ctx context.Context, u string) (string, error) {
	// e.g. https://web.archive.org/*/https://example.org
	fallback := fmt.Sprintf("%s/*/%s", dest, u)

	req, err := http.NewRequestWithContext(ctx, http.MethodGet, endpoint+"?url="+u, nil)
	if err != nil {
		return "", err
	}
	req.Header.Add("User-Agent", userAgent)

	resp, err := wbrc.httpClient.Do(req)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	data, err := io.ReadAll(resp.Body)
	if err != nil {
		return "", err
	}

	var dat map[string]interface{}
	if err := json.Unmarshal(data, &dat); err != nil {
		return "", err
	}

	// Walk archived_snapshots.closest.{available,status,url} with
	// checked type assertions so malformed JSON cannot panic.
	if archived, ok := dat["archived_snapshots"].(map[string]interface{}); ok {
		if closest, ok := archived["closest"].(map[string]interface{}); ok {
			available, _ := closest["available"].(bool)
			if snapURL, ok := closest["url"].(string); ok && available && closest["status"] == "200" {
				return snapURL, nil
			}
		}
	}

	return fallback, errors.New("not found")
}

// noRedirect makes the HTTP client surface 3xx responses directly so
// their Location headers can be inspected instead of being followed.
func noRedirect(req *http.Request, via []*http.Request) error {
	return http.ErrUseLastResponse
}