fork of go-gitdiff with jj support
at v0.7.2 15 kB view raw
package gitdiff

import (
	"bufio"
	"errors"
	"fmt"
	"io"
	"io/ioutil"
	"mime/quotedprintable"
	"net/mail"
	"strconv"
	"strings"
	"time"
	"unicode"
)

// Prefixes that ParsePatchHeader looks for on the first line of a preamble to
// decide how to parse it. mailHeaderPrefix and mailMinimumHeaderPrefix select
// the mail parser; prettyHeaderPrefix selects the pretty-format parser. With
// mailMinimumHeaderPrefix the first line is itself a mail header field, so it
// is parsed together with the rest of the message.
const (
	mailHeaderPrefix        = "From "
	prettyHeaderPrefix      = "commit "
	mailMinimumHeaderPrefix = "From:"
)

// PatchHeader is a parsed version of the preamble content that appears before
// the first diff in a patch. It includes metadata about the patch, such as the
// author and a subject.
type PatchHeader struct {
	// The SHA of the commit the patch was generated from. Empty if the SHA is
	// not included in the header.
	SHA string

	// The author details of the patch. If these details are not included in
	// the header, Author is nil and AuthorDate is the zero time.
	Author     *PatchIdentity
	AuthorDate time.Time

	// The committer details of the patch. If these details are not included in
	// the header, Committer is nil and CommitterDate is the zero time.
	Committer     *PatchIdentity
	CommitterDate time.Time

	// The title and body of the commit message describing the changes in the
	// patch. Empty if no message is included in the header.
	Title string
	Body  string

	// If the preamble looks like an email, ParsePatchHeader will
	// remove prefixes such as `Re: ` and `[PATCH v3 5/17]` from the
	// Title and place them here.
	SubjectPrefix string

	// If the preamble looks like an email, and it contains a `---`
	// line, that line will be removed and everything after it will be
	// placed in BodyAppendix.
	BodyAppendix string
}

// Message returns the commit message for the header. The message consists of
// the title and the body separated by an empty line.
59func (h *PatchHeader) Message() string { 60 var msg strings.Builder 61 if h != nil { 62 msg.WriteString(h.Title) 63 if h.Body != "" { 64 msg.WriteString("\n\n") 65 msg.WriteString(h.Body) 66 } 67 } 68 return msg.String() 69} 70 71// PatchIdentity identifies a person who authored or committed a patch. 72type PatchIdentity struct { 73 Name string 74 Email string 75} 76 77func (i PatchIdentity) String() string { 78 name := i.Name 79 if name == "" { 80 name = `""` 81 } 82 return fmt.Sprintf("%s <%s>", name, i.Email) 83} 84 85// ParsePatchIdentity parses a patch identity string. A valid string contains 86// an optional name followed by an email address in angle brackets. The angle 87// brackets must always exist, but may enclose an empty address. At least one 88// of the name or the email address must be non-empty. If the string only 89// contains an email address, that value is also used as the name. 90// 91// The name must not contain a left angle bracket, '<', and the email address 92// must not contain a right angle bracket, '>'. Otherwise, there are no 93// restrictions on the format of either field. 
94func ParsePatchIdentity(s string) (PatchIdentity, error) { 95 var emailStart, emailEnd int 96 for i, c := range s { 97 if c == '<' && emailStart == 0 { 98 emailStart = i + 1 99 } 100 if c == '>' && emailStart > 0 { 101 emailEnd = i 102 break 103 } 104 } 105 if emailStart > 0 && emailEnd == 0 { 106 return PatchIdentity{}, fmt.Errorf("invalid identity string: unclosed email section: %s", s) 107 } 108 109 var name, email string 110 if emailStart > 0 { 111 name = strings.TrimSpace(s[:emailStart-1]) 112 } 113 if emailStart > 0 && emailEnd > 0 { 114 email = strings.TrimSpace(s[emailStart:emailEnd]) 115 } 116 if name == "" && email != "" { 117 name = email 118 } 119 120 if name == "" && email == "" { 121 return PatchIdentity{}, fmt.Errorf("invalid identity string: %s", s) 122 } 123 124 return PatchIdentity{Name: name, Email: email}, nil 125} 126 127// ParsePatchDate parses a patch date string. It returns the parsed time or an 128// error if s has an unknown format. ParsePatchDate supports the iso, rfc, 129// short, raw, unix, and default formats (with local variants) used by the 130// --date flag in Git. 
func ParsePatchDate(s string) (time.Time, error) {
	const (
		isoFormat          = "2006-01-02 15:04:05 -0700"
		isoStrictFormat    = "2006-01-02T15:04:05-07:00"
		rfc2822Format      = "Mon, 2 Jan 2006 15:04:05 -0700"
		shortFormat        = "2006-01-02"
		defaultFormat      = "Mon Jan 2 15:04:05 2006 -0700"
		defaultLocalFormat = "Mon Jan 2 15:04:05 2006"
	)

	// An empty string is not an error; it yields the zero time.
	if s == "" {
		return time.Time{}, nil
	}

	// Try the textual layouts first. Layouts without a zone offset are
	// interpreted in the local time zone.
	for _, layout := range []string{
		isoFormat,
		isoStrictFormat,
		rfc2822Format,
		shortFormat,
		defaultFormat,
		defaultLocalFormat,
	} {
		if t, err := time.ParseInLocation(layout, s, time.Local); err == nil {
			return t, nil
		}
	}

	// unix format: seconds since the epoch
	if unix, err := strconv.ParseInt(s, 10, 64); err == nil {
		return time.Unix(unix, 0), nil
	}

	// raw format: seconds since the epoch, a space, and a zone offset
	if space := strings.IndexByte(s, ' '); space > 0 {
		unix, uerr := strconv.ParseInt(s[:space], 10, 64)
		zone, zerr := time.Parse("-0700", s[space+1:])
		if uerr == nil && zerr == nil {
			return time.Unix(unix, 0).In(zone.Location()), nil
		}
	}

	return time.Time{}, fmt.Errorf("unknown date format: %s", s)
}

// A PatchHeaderOption modifies the behavior of ParsePatchHeader.
type PatchHeaderOption func(*patchHeaderOptions)

// SubjectCleanMode controls how ParsePatchHeader cleans subject lines when
// parsing mail-formatted patches.
type SubjectCleanMode int

const (
	// SubjectCleanWhitespace removes leading and trailing whitespace.
	SubjectCleanWhitespace SubjectCleanMode = iota

	// SubjectCleanAll removes leading and trailing whitespace, leading "Re:"
	// markers in any letter case, leading ":" characters, and leading strings
	// enclosed by '[' and ']'. This is the default behavior of git (see
	// `git mailinfo`) and this package.
	SubjectCleanAll

	// SubjectCleanPatchOnly is the same as SubjectCleanAll, but only removes
	// leading strings enclosed by '[' and ']' if they contain "PATCH".
	SubjectCleanPatchOnly
)

// WithSubjectCleanMode sets the SubjectCleanMode for header parsing. By
// default, uses SubjectCleanAll.
func WithSubjectCleanMode(m SubjectCleanMode) PatchHeaderOption {
	return func(opts *patchHeaderOptions) {
		opts.subjectCleanMode = m
	}
}

// patchHeaderOptions collects the settings that PatchHeaderOption functions
// can change.
type patchHeaderOptions struct {
	subjectCleanMode SubjectCleanMode
}

// ParsePatchHeader parses the preamble string returned by [Parse] into a
// PatchHeader. Due to the variety of header formats, some fields of the parsed
// PatchHeader may be unset after parsing.
//
// Supported formats are the short, medium, full, fuller, and email pretty
// formats used by `git diff`, `git log`, and `git show` and the UNIX mailbox
// format used by `git format-patch`.
//
// When parsing mail-formatted headers, ParsePatchHeader tries to remove
// email-specific content from the title and body:
//
//   - Based on the SubjectCleanMode, remove prefixes like reply markers and
//     "[PATCH]" strings from the subject, saving any removed content in the
//     SubjectPrefix field. Parsing always discards leading and trailing
//     whitespace from the subject line. The default mode is SubjectCleanAll.
//
//   - If the body contains a "---" line (3 hyphens), remove that line and any
//     content after it from the body and save it in the BodyAppendix field.
//
// ParsePatchHeader tries to process content it does not understand without
// returning errors, but will return errors if well-identified content like
// dates or identities uses unknown or invalid formats.
231func ParsePatchHeader(header string, options ...PatchHeaderOption) (*PatchHeader, error) { 232 opts := patchHeaderOptions{ 233 subjectCleanMode: SubjectCleanAll, // match git defaults 234 } 235 for _, optFn := range options { 236 optFn(&opts) 237 } 238 239 header = strings.TrimSpace(header) 240 if header == "" { 241 return &PatchHeader{}, nil 242 } 243 244 var firstLine, rest string 245 if idx := strings.IndexByte(header, '\n'); idx >= 0 { 246 firstLine = header[:idx] 247 rest = header[idx+1:] 248 } else { 249 firstLine = header 250 rest = "" 251 } 252 253 switch { 254 case strings.HasPrefix(firstLine, mailHeaderPrefix): 255 return parseHeaderMail(firstLine, strings.NewReader(rest), opts) 256 257 case strings.HasPrefix(firstLine, mailMinimumHeaderPrefix): 258 // With a minimum header, the first line is part of the actual mail 259 // content and needs to be parsed as part of the "rest" 260 return parseHeaderMail("", strings.NewReader(header), opts) 261 262 case strings.HasPrefix(firstLine, prettyHeaderPrefix): 263 return parseHeaderPretty(firstLine, strings.NewReader(rest)) 264 } 265 266 return nil, errors.New("unrecognized patch header format") 267} 268 269func parseHeaderPretty(prettyLine string, r io.Reader) (*PatchHeader, error) { 270 const ( 271 authorPrefix = "Author:" 272 commitPrefix = "Commit:" 273 datePrefix = "Date:" 274 authorDatePrefix = "AuthorDate:" 275 commitDatePrefix = "CommitDate:" 276 ) 277 278 h := &PatchHeader{} 279 280 prettyLine = strings.TrimPrefix(prettyLine, prettyHeaderPrefix) 281 if i := strings.IndexByte(prettyLine, ' '); i > 0 { 282 h.SHA = prettyLine[:i] 283 } else { 284 h.SHA = prettyLine 285 } 286 287 s := bufio.NewScanner(r) 288 for s.Scan() { 289 line := s.Text() 290 291 // empty line marks end of fields, remaining lines are title/message 292 if strings.TrimSpace(line) == "" { 293 break 294 } 295 296 switch { 297 case strings.HasPrefix(line, authorPrefix): 298 u, err := ParsePatchIdentity(line[len(authorPrefix):]) 299 if err != 
nil { 300 return nil, err 301 } 302 h.Author = &u 303 304 case strings.HasPrefix(line, commitPrefix): 305 u, err := ParsePatchIdentity(line[len(commitPrefix):]) 306 if err != nil { 307 return nil, err 308 } 309 h.Committer = &u 310 311 case strings.HasPrefix(line, datePrefix): 312 d, err := ParsePatchDate(strings.TrimSpace(line[len(datePrefix):])) 313 if err != nil { 314 return nil, err 315 } 316 h.AuthorDate = d 317 318 case strings.HasPrefix(line, authorDatePrefix): 319 d, err := ParsePatchDate(strings.TrimSpace(line[len(authorDatePrefix):])) 320 if err != nil { 321 return nil, err 322 } 323 h.AuthorDate = d 324 325 case strings.HasPrefix(line, commitDatePrefix): 326 d, err := ParsePatchDate(strings.TrimSpace(line[len(commitDatePrefix):])) 327 if err != nil { 328 return nil, err 329 } 330 h.CommitterDate = d 331 } 332 } 333 if s.Err() != nil { 334 return nil, s.Err() 335 } 336 337 title, indent := scanMessageTitle(s) 338 if s.Err() != nil { 339 return nil, s.Err() 340 } 341 h.Title = title 342 343 if title != "" { 344 // Don't check for an appendix, pretty headers do not contain them 345 body, _ := scanMessageBody(s, indent, false) 346 if s.Err() != nil { 347 return nil, s.Err() 348 } 349 h.Body = body 350 } 351 352 return h, nil 353} 354 355func scanMessageTitle(s *bufio.Scanner) (title string, indent string) { 356 var b strings.Builder 357 for i := 0; s.Scan(); i++ { 358 line := s.Text() 359 trimLine := strings.TrimSpace(line) 360 if trimLine == "" { 361 break 362 } 363 364 if i == 0 { 365 if start := strings.IndexFunc(line, func(c rune) bool { return !unicode.IsSpace(c) }); start > 0 { 366 indent = line[:start] 367 } 368 } 369 if b.Len() > 0 { 370 b.WriteByte(' ') 371 } 372 b.WriteString(trimLine) 373 } 374 return b.String(), indent 375} 376 377func scanMessageBody(s *bufio.Scanner, indent string, separateAppendix bool) (string, string) { 378 // Body and appendix 379 var body, appendix strings.Builder 380 c := &body 381 var empty int 382 for i := 0; s.Scan(); 
i++ { 383 line := s.Text() 384 385 line = strings.TrimRightFunc(line, unicode.IsSpace) 386 line = strings.TrimPrefix(line, indent) 387 388 if line == "" { 389 empty++ 390 continue 391 } 392 393 // If requested, parse out "appendix" information (often added 394 // by `git format-patch` and removed by `git am`). 395 if separateAppendix && c == &body && line == "---" { 396 c = &appendix 397 continue 398 } 399 400 if c.Len() > 0 { 401 c.WriteByte('\n') 402 if empty > 0 { 403 c.WriteByte('\n') 404 } 405 } 406 empty = 0 407 408 c.WriteString(line) 409 } 410 return body.String(), appendix.String() 411} 412 413func parseHeaderMail(mailLine string, r io.Reader, opts patchHeaderOptions) (*PatchHeader, error) { 414 msg, err := mail.ReadMessage(r) 415 if err != nil { 416 return nil, err 417 } 418 419 h := &PatchHeader{} 420 421 if strings.HasPrefix(mailLine, mailHeaderPrefix) { 422 mailLine = strings.TrimPrefix(mailLine, mailHeaderPrefix) 423 if i := strings.IndexByte(mailLine, ' '); i > 0 { 424 h.SHA = mailLine[:i] 425 } 426 } 427 428 addrs, err := msg.Header.AddressList("From") 429 if err != nil && !errors.Is(err, mail.ErrHeaderNotPresent) { 430 return nil, err 431 } 432 if len(addrs) > 0 { 433 addr := addrs[0] 434 if addr.Name == "" { 435 addr.Name = addr.Address 436 } 437 h.Author = &PatchIdentity{Name: addr.Name, Email: addr.Address} 438 } 439 440 date := msg.Header.Get("Date") 441 if date != "" { 442 d, err := ParsePatchDate(date) 443 if err != nil { 444 return nil, err 445 } 446 h.AuthorDate = d 447 } 448 449 subject := msg.Header.Get("Subject") 450 h.SubjectPrefix, h.Title = cleanSubject(subject, opts.subjectCleanMode) 451 452 s := bufio.NewScanner(msg.Body) 453 h.Body, h.BodyAppendix = scanMessageBody(s, "", true) 454 if s.Err() != nil { 455 return nil, s.Err() 456 } 457 458 return h, nil 459} 460 461func cleanSubject(s string, mode SubjectCleanMode) (prefix string, subject string) { 462 switch mode { 463 case SubjectCleanAll, SubjectCleanPatchOnly: 464 case 
SubjectCleanWhitespace: 465 return "", strings.TrimSpace(decodeSubject(s)) 466 default: 467 panic(fmt.Sprintf("unknown clean mode: %d", mode)) 468 } 469 470 // Based on the algorithm from Git in mailinfo.c:cleanup_subject() 471 // If compatibility with `git am` drifts, go there to see if there are any updates. 472 473 at := 0 474 for at < len(s) { 475 switch s[at] { 476 case 'r', 'R': 477 // Detect re:, Re:, rE: and RE: 478 if at+2 < len(s) && (s[at+1] == 'e' || s[at+1] == 'E') && s[at+2] == ':' { 479 at += 3 480 continue 481 } 482 483 case ' ', '\t', ':': 484 // Delete whitespace and duplicate ':' characters 485 at++ 486 continue 487 488 case '[': 489 if i := strings.IndexByte(s[at:], ']'); i > 0 { 490 if mode == SubjectCleanAll || strings.Contains(s[at:at+i+1], "PATCH") { 491 at += i + 1 492 continue 493 } 494 } 495 } 496 497 // Nothing was removed, end processing 498 break 499 } 500 501 prefix = strings.TrimLeftFunc(s[:at], unicode.IsSpace) 502 subject = strings.TrimRightFunc(decodeSubject(s[at:]), unicode.IsSpace) 503 return 504} 505 506// Decodes a subject line. Currently only supports quoted-printable UTF-8. This format is the result 507// of a `git format-patch` when the commit title has a non-ASCII character (i.e. an emoji). 508// See for reference: https://stackoverflow.com/questions/27695749/gmail-api-not-respecting-utf-encoding-in-subject 509func decodeSubject(encoded string) string { 510 if !strings.HasPrefix(encoded, "=?UTF-8?q?") { 511 // not UTF-8 encoded 512 return encoded 513 } 514 515 // If the subject is too long, `git format-patch` may produce a subject line across 516 // multiple lines. 
When parsed, this can look like the following: 517 // <UTF8-prefix><first-line> <UTF8-prefix><second-line> 518 payload := " " + encoded 519 payload = strings.ReplaceAll(payload, " =?UTF-8?q?", "") 520 payload = strings.ReplaceAll(payload, "?=", "") 521 522 decoded, err := ioutil.ReadAll(quotedprintable.NewReader(strings.NewReader(payload))) 523 if err != nil { 524 // if err, abort decoding and return original subject 525 return encoded 526 } 527 528 return string(decoded) 529}