fork of go-gitdiff with jj support
at v0.8.2 13 kB view raw
1package gitdiff 2 3import ( 4 "bufio" 5 "errors" 6 "fmt" 7 "io" 8 "io/ioutil" 9 "mime/quotedprintable" 10 "net/mail" 11 "strconv" 12 "strings" 13 "time" 14 "unicode" 15) 16 17const ( 18 mailHeaderPrefix = "From " 19 prettyHeaderPrefix = "commit " 20 mailMinimumHeaderPrefix = "From:" 21) 22 23// PatchHeader is a parsed version of the preamble content that appears before 24// the first diff in a patch. It includes metadata about the patch, such as the 25// author and a subject. 26type PatchHeader struct { 27 // The SHA of the commit the patch was generated from. Empty if the SHA is 28 // not included in the header. 29 SHA string 30 31 // The author details of the patch. If these details are not included in 32 // the header, Author is nil and AuthorDate is the zero time. 33 Author *PatchIdentity 34 AuthorDate time.Time 35 36 // The committer details of the patch. If these details are not included in 37 // the header, Committer is nil and CommitterDate is the zero time. 38 Committer *PatchIdentity 39 CommitterDate time.Time 40 41 // The title and body of the commit message describing the changes in the 42 // patch. Empty if no message is included in the header. 43 Title string 44 Body string 45 46 // If the preamble looks like an email, ParsePatchHeader will 47 // remove prefixes such as `Re: ` and `[PATCH v3 5/17]` from the 48 // Title and place them here. 49 SubjectPrefix string 50 51 // If the preamble looks like an email, and it contains a `---` 52 // line, that line will be removed and everything after it will be 53 // placed in BodyAppendix. 54 BodyAppendix string 55 56 // All headers completely unparsed 57 RawHeaders map[string][]string 58} 59 60// Message returns the commit message for the header. The message consists of 61// the title and the body separated by an empty line. 62func (h *PatchHeader) Message() string { 63 var msg strings.Builder 64 if h != nil { 65 msg.WriteString(h.Title) 66 if h.Body != "" { 67 msg.WriteString("\n\n") 68 msg.WriteString(h.Body) 69 } 70 } 71 return msg.String() 72} 73 74// ParsePatchDate parses a patch date string. It returns the parsed time or an 75// error if s has an unknown format. ParsePatchDate supports the iso, rfc, 76// short, raw, unix, and default formats (with local variants) used by the 77// --date flag in Git. 78func ParsePatchDate(s string) (time.Time, error) { 79 const ( 80 isoFormat = "2006-01-02 15:04:05 -0700" 81 isoStrictFormat = "2006-01-02T15:04:05-07:00" 82 rfc2822Format = "Mon, 2 Jan 2006 15:04:05 -0700" 83 shortFormat = "2006-01-02" 84 defaultFormat = "Mon Jan 2 15:04:05 2006 -0700" 85 defaultLocalFormat = "Mon Jan 2 15:04:05 2006" 86 ) 87 88 if s == "" { 89 return time.Time{}, nil 90 } 91 92 for _, fmt := range []string{ 93 isoFormat, 94 isoStrictFormat, 95 rfc2822Format, 96 shortFormat, 97 defaultFormat, 98 defaultLocalFormat, 99 } { 100 if t, err := time.ParseInLocation(fmt, s, time.Local); err == nil { 101 return t, nil 102 } 103 } 104 105 // unix format 106 if unix, err := strconv.ParseInt(s, 10, 64); err == nil { 107 return time.Unix(unix, 0), nil 108 } 109 110 // raw format 111 if space := strings.IndexByte(s, ' '); space > 0 { 112 unix, uerr := strconv.ParseInt(s[:space], 10, 64) 113 zone, zerr := time.Parse("-0700", s[space+1:]) 114 if uerr == nil && zerr == nil { 115 return time.Unix(unix, 0).In(zone.Location()), nil 116 } 117 } 118 119 return time.Time{}, fmt.Errorf("unknown date format: %s", s) 120} 121 122// A PatchHeaderOption modifies the behavior of ParsePatchHeader. 123type PatchHeaderOption func(*patchHeaderOptions) 124 125// SubjectCleanMode controls how ParsePatchHeader cleans subject lines when 126// parsing mail-formatted patches. 127type SubjectCleanMode int 128 129const ( 130 // SubjectCleanWhitespace removes leading and trailing whitespace. 131 SubjectCleanWhitespace SubjectCleanMode = iota 132 133 // SubjectCleanAll removes leading and trailing whitespace, leading "Re:", 134 // "re:", and ":" strings, and leading strings enclosed by '[' and ']'. 135 // This is the default behavior of git (see `git mailinfo`) and this 136 // package. 137 SubjectCleanAll 138 139 // SubjectCleanPatchOnly is the same as SubjectCleanAll, but only removes 140 // leading strings enclosed by '[' and ']' if they start with "PATCH". 141 SubjectCleanPatchOnly 142) 143 144// WithSubjectCleanMode sets the SubjectCleanMode for header parsing. By 145// default, uses SubjectCleanAll. 146func WithSubjectCleanMode(m SubjectCleanMode) PatchHeaderOption { 147 return func(opts *patchHeaderOptions) { 148 opts.subjectCleanMode = m 149 } 150} 151 152type patchHeaderOptions struct { 153 subjectCleanMode SubjectCleanMode 154} 155 156// ParsePatchHeader parses the preamble string returned by [Parse] into a 157// PatchHeader. Due to the variety of header formats, some fields of the parsed 158// PatchHeader may be unset after parsing. 159// 160// Supported formats are the short, medium, full, fuller, and email pretty 161// formats used by `git diff`, `git log`, and `git show` and the UNIX mailbox 162// format used by `git format-patch`. 163// 164// When parsing mail-formatted headers, ParsePatchHeader tries to remove 165// email-specific content from the title and body: 166// 167// - Based on the SubjectCleanMode, remove prefixes like reply markers and 168// "[PATCH]" strings from the subject, saving any removed content in the 169// SubjectPrefix field. Parsing always discards leading and trailing 170// whitespace from the subject line. The default mode is SubjectCleanAll. 171// 172// - If the body contains a "---" line (3 hyphens), remove that line and any 173// content after it from the body and save it in the BodyAppendix field. 174// 175// ParsePatchHeader tries to process content it does not understand wthout 176// returning errors, but will return errors if well-identified content like 177// dates or identies uses unknown or invalid formats. 178func ParsePatchHeader(header string, options ...PatchHeaderOption) (*PatchHeader, error) { 179 opts := patchHeaderOptions{ 180 subjectCleanMode: SubjectCleanAll, // match git defaults 181 } 182 for _, optFn := range options { 183 optFn(&opts) 184 } 185 186 header = strings.TrimSpace(header) 187 if header == "" { 188 return &PatchHeader{}, nil 189 } 190 191 var firstLine, rest string 192 if idx := strings.IndexByte(header, '\n'); idx >= 0 { 193 firstLine = header[:idx] 194 rest = header[idx+1:] 195 } else { 196 firstLine = header 197 rest = "" 198 } 199 200 switch { 201 case strings.HasPrefix(firstLine, mailHeaderPrefix): 202 return parseHeaderMail(firstLine, strings.NewReader(rest), opts) 203 204 case strings.HasPrefix(firstLine, mailMinimumHeaderPrefix): 205 // With a minimum header, the first line is part of the actual mail 206 // content and needs to be parsed as part of the "rest" 207 return parseHeaderMail("", strings.NewReader(header), opts) 208 209 case strings.HasPrefix(firstLine, prettyHeaderPrefix): 210 return parseHeaderPretty(firstLine, strings.NewReader(rest)) 211 } 212 213 return nil, errors.New("unrecognized patch header format") 214} 215 216func parseHeaderPretty(prettyLine string, r io.Reader) (*PatchHeader, error) { 217 const ( 218 authorPrefix = "Author:" 219 commitPrefix = "Commit:" 220 datePrefix = "Date:" 221 authorDatePrefix = "AuthorDate:" 222 commitDatePrefix = "CommitDate:" 223 ) 224 225 h := &PatchHeader{} 226 227 prettyLine = strings.TrimPrefix(prettyLine, prettyHeaderPrefix) 228 if i := strings.IndexByte(prettyLine, ' '); i > 0 { 229 h.SHA = prettyLine[:i] 230 } else { 231 h.SHA = prettyLine 232 } 233 234 s := bufio.NewScanner(r) 235 for s.Scan() { 236 line := s.Text() 237 238 // empty line marks end of fields, remaining lines are title/message 239 if strings.TrimSpace(line) == "" { 240 break 241 } 242 243 items := strings.SplitN(line, ":", 2) 244 245 // we have "key: value" 246 if len(items) == 2 { 247 key := items[0] 248 val := items[1] 249 h.RawHeaders[key] = append(h.RawHeaders[key], val) 250 } 251 252 switch { 253 case strings.HasPrefix(line, authorPrefix): 254 u, err := ParsePatchIdentity(line[len(authorPrefix):]) 255 if err != nil { 256 return nil, err 257 } 258 h.Author = &u 259 260 case strings.HasPrefix(line, commitPrefix): 261 u, err := ParsePatchIdentity(line[len(commitPrefix):]) 262 if err != nil { 263 return nil, err 264 } 265 h.Committer = &u 266 267 case strings.HasPrefix(line, datePrefix): 268 d, err := ParsePatchDate(strings.TrimSpace(line[len(datePrefix):])) 269 if err != nil { 270 return nil, err 271 } 272 h.AuthorDate = d 273 274 case strings.HasPrefix(line, authorDatePrefix): 275 d, err := ParsePatchDate(strings.TrimSpace(line[len(authorDatePrefix):])) 276 if err != nil { 277 return nil, err 278 } 279 h.AuthorDate = d 280 281 case strings.HasPrefix(line, commitDatePrefix): 282 d, err := ParsePatchDate(strings.TrimSpace(line[len(commitDatePrefix):])) 283 if err != nil { 284 return nil, err 285 } 286 h.CommitterDate = d 287 } 288 } 289 if s.Err() != nil { 290 return nil, s.Err() 291 } 292 293 title, indent := scanMessageTitle(s) 294 if s.Err() != nil { 295 return nil, s.Err() 296 } 297 h.Title = title 298 299 if title != "" { 300 // Don't check for an appendix, pretty headers do not contain them 301 body, _ := scanMessageBody(s, indent, false) 302 if s.Err() != nil { 303 return nil, s.Err() 304 } 305 h.Body = body 306 } 307 308 return h, nil 309} 310 311func scanMessageTitle(s *bufio.Scanner) (title string, indent string) { 312 var b strings.Builder 313 for i := 0; s.Scan(); i++ { 314 line := s.Text() 315 trimLine := strings.TrimSpace(line) 316 if trimLine == "" { 317 break 318 } 319 320 if i == 0 { 321 if start := strings.IndexFunc(line, func(c rune) bool { return !unicode.IsSpace(c) }); start > 0 { 322 indent = line[:start] 323 } 324 } 325 if b.Len() > 0 { 326 b.WriteByte(' ') 327 } 328 b.WriteString(trimLine) 329 } 330 return b.String(), indent 331} 332 333func scanMessageBody(s *bufio.Scanner, indent string, separateAppendix bool) (string, string) { 334 // Body and appendix 335 var body, appendix strings.Builder 336 c := &body 337 var empty int 338 for i := 0; s.Scan(); i++ { 339 line := s.Text() 340 341 line = strings.TrimRightFunc(line, unicode.IsSpace) 342 line = strings.TrimPrefix(line, indent) 343 344 if line == "" { 345 empty++ 346 continue 347 } 348 349 // If requested, parse out "appendix" information (often added 350 // by `git format-patch` and removed by `git am`). 351 if separateAppendix && c == &body && line == "---" { 352 c = &appendix 353 continue 354 } 355 356 if c.Len() > 0 { 357 c.WriteByte('\n') 358 if empty > 0 { 359 c.WriteByte('\n') 360 } 361 } 362 empty = 0 363 364 c.WriteString(line) 365 } 366 return body.String(), appendix.String() 367} 368 369func parseHeaderMail(mailLine string, r io.Reader, opts patchHeaderOptions) (*PatchHeader, error) { 370 msg, err := mail.ReadMessage(r) 371 if err != nil { 372 return nil, err 373 } 374 375 h := &PatchHeader{} 376 h.RawHeaders = msg.Header 377 378 if strings.HasPrefix(mailLine, mailHeaderPrefix) { 379 mailLine = strings.TrimPrefix(mailLine, mailHeaderPrefix) 380 if i := strings.IndexByte(mailLine, ' '); i > 0 { 381 h.SHA = mailLine[:i] 382 } 383 } 384 385 from := msg.Header.Get("From") 386 if from != "" { 387 u, err := ParsePatchIdentity(from) 388 if err != nil { 389 return nil, err 390 } 391 h.Author = &u 392 } 393 394 date := msg.Header.Get("Date") 395 if date != "" { 396 d, err := ParsePatchDate(date) 397 if err != nil { 398 return nil, err 399 } 400 h.AuthorDate = d 401 } 402 403 subject := msg.Header.Get("Subject") 404 h.SubjectPrefix, h.Title = cleanSubject(subject, opts.subjectCleanMode) 405 406 s := bufio.NewScanner(msg.Body) 407 h.Body, h.BodyAppendix = scanMessageBody(s, "", true) 408 if s.Err() != nil { 409 return nil, s.Err() 410 } 411 412 return h, nil 413} 414 415func cleanSubject(s string, mode SubjectCleanMode) (prefix string, subject string) { 416 switch mode { 417 case SubjectCleanAll, SubjectCleanPatchOnly: 418 case SubjectCleanWhitespace: 419 return "", strings.TrimSpace(decodeSubject(s)) 420 default: 421 panic(fmt.Sprintf("unknown clean mode: %d", mode)) 422 } 423 424 // Based on the algorithm from Git in mailinfo.c:cleanup_subject() 425 // If compatibility with `git am` drifts, go there to see if there are any updates. 426 427 at := 0 428 for at < len(s) { 429 switch s[at] { 430 case 'r', 'R': 431 // Detect re:, Re:, rE: and RE: 432 if at+2 < len(s) && (s[at+1] == 'e' || s[at+1] == 'E') && s[at+2] == ':' { 433 at += 3 434 continue 435 } 436 437 case ' ', '\t', ':': 438 // Delete whitespace and duplicate ':' characters 439 at++ 440 continue 441 442 case '[': 443 if i := strings.IndexByte(s[at:], ']'); i > 0 { 444 if mode == SubjectCleanAll || strings.Contains(s[at:at+i+1], "PATCH") { 445 at += i + 1 446 continue 447 } 448 } 449 } 450 451 // Nothing was removed, end processing 452 break 453 } 454 455 prefix = strings.TrimLeftFunc(s[:at], unicode.IsSpace) 456 subject = strings.TrimRightFunc(decodeSubject(s[at:]), unicode.IsSpace) 457 return 458} 459 460// Decodes a subject line. Currently only supports quoted-printable UTF-8. This format is the result 461// of a `git format-patch` when the commit title has a non-ASCII character (i.e. an emoji). 462// See for reference: https://stackoverflow.com/questions/27695749/gmail-api-not-respecting-utf-encoding-in-subject 463func decodeSubject(encoded string) string { 464 if !strings.HasPrefix(encoded, "=?UTF-8?q?") { 465 // not UTF-8 encoded 466 return encoded 467 } 468 469 // If the subject is too long, `git format-patch` may produce a subject line across 470 // multiple lines. When parsed, this can look like the following: 471 // <UTF8-prefix><first-line> <UTF8-prefix><second-line> 472 payload := " " + encoded 473 payload = strings.ReplaceAll(payload, " =?UTF-8?q?", "") 474 payload = strings.ReplaceAll(payload, "?=", "") 475 476 decoded, err := ioutil.ReadAll(quotedprintable.NewReader(strings.NewReader(payload))) 477 if err != nil { 478 // if err, abort decoding and return original subject 479 return encoded 480 } 481 482 return string(decoded) 483}