fork of go-gitdiff with jj support
at v0.7.1 14 kB view raw
1package gitdiff 2 3import ( 4 "bufio" 5 "errors" 6 "fmt" 7 "io" 8 "io/ioutil" 9 "mime/quotedprintable" 10 "net/mail" 11 "strconv" 12 "strings" 13 "time" 14 "unicode" 15) 16 17const ( 18 mailHeaderPrefix = "From " 19 prettyHeaderPrefix = "commit " 20 mailMinimumHeaderPrefix = "From:" 21) 22 23// PatchHeader is a parsed version of the preamble content that appears before 24// the first diff in a patch. It includes metadata about the patch, such as the 25// author and a subject. 26type PatchHeader struct { 27 // The SHA of the commit the patch was generated from. Empty if the SHA is 28 // not included in the header. 29 SHA string 30 31 // The author details of the patch. If these details are not included in 32 // the header, Author is nil and AuthorDate is the zero time. 33 Author *PatchIdentity 34 AuthorDate time.Time 35 36 // The committer details of the patch. If these details are not included in 37 // the header, Committer is nil and CommitterDate is the zero time. 38 Committer *PatchIdentity 39 CommitterDate time.Time 40 41 // The title and body of the commit message describing the changes in the 42 // patch. Empty if no message is included in the header. 43 Title string 44 Body string 45 46 // If the preamble looks like an email, ParsePatchHeader will 47 // remove prefixes such as `Re: ` and `[PATCH v3 5/17]` from the 48 // Title and place them here. 49 SubjectPrefix string 50 51 // If the preamble looks like an email, and it contains a `---` 52 // line, that line will be removed and everything after it will be 53 // placed in BodyAppendix. 54 BodyAppendix string 55} 56 57// Message returns the commit message for the header. The message consists of 58// the title and the body separated by an empty line. 
59func (h *PatchHeader) Message() string { 60 var msg strings.Builder 61 if h != nil { 62 msg.WriteString(h.Title) 63 if h.Body != "" { 64 msg.WriteString("\n\n") 65 msg.WriteString(h.Body) 66 } 67 } 68 return msg.String() 69} 70 71// PatchIdentity identifies a person who authored or committed a patch. 72type PatchIdentity struct { 73 Name string 74 Email string 75} 76 77func (i PatchIdentity) String() string { 78 name := i.Name 79 if name == "" { 80 name = `""` 81 } 82 return fmt.Sprintf("%s <%s>", name, i.Email) 83} 84 85// ParsePatchIdentity parses a patch identity string. A valid string contains a 86// non-empty name followed by an email address in angle brackets. Like Git, 87// ParsePatchIdentity does not require that the email address is valid or 88// properly formatted, only that it is non-empty. The name must not contain a 89// left angle bracket, '<', and the email address must not contain a right 90// angle bracket, '>'. 91func ParsePatchIdentity(s string) (PatchIdentity, error) { 92 var emailStart, emailEnd int 93 for i, c := range s { 94 if c == '<' && emailStart == 0 { 95 emailStart = i + 1 96 } 97 if c == '>' && emailStart > 0 { 98 emailEnd = i 99 break 100 } 101 } 102 if emailStart > 0 && emailEnd == 0 { 103 return PatchIdentity{}, fmt.Errorf("invalid identity string: unclosed email section: %s", s) 104 } 105 106 var name, email string 107 if emailStart > 0 { 108 name = strings.TrimSpace(s[:emailStart-1]) 109 } 110 if emailStart > 0 && emailEnd > 0 { 111 email = strings.TrimSpace(s[emailStart:emailEnd]) 112 } 113 if name == "" || email == "" { 114 return PatchIdentity{}, fmt.Errorf("invalid identity string: %s", s) 115 } 116 117 return PatchIdentity{Name: name, Email: email}, nil 118} 119 120// ParsePatchDate parses a patch date string. It returns the parsed time or an 121// error if s has an unknown format. ParsePatchDate supports the iso, rfc, 122// short, raw, unix, and default formats (with local variants) used by the 123// --date flag in Git. 
func ParsePatchDate(s string) (time.Time, error) {
	const (
		isoFormat          = "2006-01-02 15:04:05 -0700"
		isoStrictFormat    = "2006-01-02T15:04:05-07:00"
		rfc2822Format      = "Mon, 2 Jan 2006 15:04:05 -0700"
		shortFormat        = "2006-01-02"
		defaultFormat      = "Mon Jan 2 15:04:05 2006 -0700"
		defaultLocalFormat = "Mon Jan 2 15:04:05 2006"
	)

	// An empty string is not an error; it yields the zero time.
	if s == "" {
		return time.Time{}, nil
	}

	// Layouts without an explicit zone are interpreted in the local zone.
	// The loop variable is named layout so it does not shadow package fmt.
	for _, layout := range []string{
		isoFormat,
		isoStrictFormat,
		rfc2822Format,
		shortFormat,
		defaultFormat,
		defaultLocalFormat,
	} {
		if t, err := time.ParseInLocation(layout, s, time.Local); err == nil {
			return t, nil
		}
	}

	// unix format: a bare count of seconds since the epoch
	if unix, err := strconv.ParseInt(s, 10, 64); err == nil {
		return time.Unix(unix, 0), nil
	}

	// raw format: seconds since the epoch, a space, and a zone offset
	if space := strings.IndexByte(s, ' '); space > 0 {
		unix, uerr := strconv.ParseInt(s[:space], 10, 64)
		zone, zerr := time.Parse("-0700", s[space+1:])
		if uerr == nil && zerr == nil {
			return time.Unix(unix, 0).In(zone.Location()), nil
		}
	}

	return time.Time{}, fmt.Errorf("unknown date format: %s", s)
}

// A PatchHeaderOption modifies the behavior of ParsePatchHeader.
type PatchHeaderOption func(*patchHeaderOptions)

// SubjectCleanMode controls how ParsePatchHeader cleans subject lines when
// parsing mail-formatted patches.
type SubjectCleanMode int

const (
	// SubjectCleanWhitespace removes leading and trailing whitespace.
	SubjectCleanWhitespace SubjectCleanMode = iota

	// SubjectCleanAll removes leading and trailing whitespace, leading "Re:",
	// "re:", and ":" strings, and leading strings enclosed by '[' and ']'.
	// This is the default behavior of git (see `git mailinfo`) and this
	// package.
	SubjectCleanAll

	// SubjectCleanPatchOnly is the same as SubjectCleanAll, but only removes
	// leading strings enclosed by '[' and ']' if they start with "PATCH".
	SubjectCleanPatchOnly
)

// WithSubjectCleanMode sets the SubjectCleanMode for header parsing. By
// default, uses SubjectCleanAll.
func WithSubjectCleanMode(m SubjectCleanMode) PatchHeaderOption {
	return func(opts *patchHeaderOptions) {
		opts.subjectCleanMode = m
	}
}

// patchHeaderOptions collects the settings that PatchHeaderOption functions
// can modify.
type patchHeaderOptions struct {
	subjectCleanMode SubjectCleanMode
}

// ParsePatchHeader parses the preamble string returned by [Parse] into a
// PatchHeader. Due to the variety of header formats, some fields of the parsed
// PatchHeader may be unset after parsing.
//
// Supported formats are the short, medium, full, fuller, and email pretty
// formats used by `git diff`, `git log`, and `git show` and the UNIX mailbox
// format used by `git format-patch`.
//
// When parsing mail-formatted headers, ParsePatchHeader tries to remove
// email-specific content from the title and body:
//
//   - Based on the SubjectCleanMode, remove prefixes like reply markers and
//     "[PATCH]" strings from the subject, saving any removed content in the
//     SubjectPrefix field. Parsing always discards leading and trailing
//     whitespace from the subject line. The default mode is SubjectCleanAll.
//
//   - If the body contains a "---" line (3 hyphens), remove that line and any
//     content after it from the body and save it in the BodyAppendix field.
//
// ParsePatchHeader tries to process content it does not understand without
// returning errors, but will return errors if well-identified content like
// dates or identities uses unknown or invalid formats.
224func ParsePatchHeader(header string, options ...PatchHeaderOption) (*PatchHeader, error) { 225 opts := patchHeaderOptions{ 226 subjectCleanMode: SubjectCleanAll, // match git defaults 227 } 228 for _, optFn := range options { 229 optFn(&opts) 230 } 231 232 header = strings.TrimSpace(header) 233 if header == "" { 234 return &PatchHeader{}, nil 235 } 236 237 var firstLine, rest string 238 if idx := strings.IndexByte(header, '\n'); idx >= 0 { 239 firstLine = header[:idx] 240 rest = header[idx+1:] 241 } else { 242 firstLine = header 243 rest = "" 244 } 245 246 switch { 247 case strings.HasPrefix(firstLine, mailHeaderPrefix): 248 return parseHeaderMail(firstLine, strings.NewReader(rest), opts) 249 250 case strings.HasPrefix(firstLine, mailMinimumHeaderPrefix): 251 // With a minimum header, the first line is part of the actual mail 252 // content and needs to be parsed as part of the "rest" 253 return parseHeaderMail("", strings.NewReader(header), opts) 254 255 case strings.HasPrefix(firstLine, prettyHeaderPrefix): 256 return parseHeaderPretty(firstLine, strings.NewReader(rest)) 257 } 258 259 return nil, errors.New("unrecognized patch header format") 260} 261 262func parseHeaderPretty(prettyLine string, r io.Reader) (*PatchHeader, error) { 263 const ( 264 authorPrefix = "Author:" 265 commitPrefix = "Commit:" 266 datePrefix = "Date:" 267 authorDatePrefix = "AuthorDate:" 268 commitDatePrefix = "CommitDate:" 269 ) 270 271 h := &PatchHeader{} 272 273 prettyLine = strings.TrimPrefix(prettyLine, prettyHeaderPrefix) 274 if i := strings.IndexByte(prettyLine, ' '); i > 0 { 275 h.SHA = prettyLine[:i] 276 } else { 277 h.SHA = prettyLine 278 } 279 280 s := bufio.NewScanner(r) 281 for s.Scan() { 282 line := s.Text() 283 284 // empty line marks end of fields, remaining lines are title/message 285 if strings.TrimSpace(line) == "" { 286 break 287 } 288 289 switch { 290 case strings.HasPrefix(line, authorPrefix): 291 u, err := ParsePatchIdentity(line[len(authorPrefix):]) 292 if err != 
nil { 293 return nil, err 294 } 295 h.Author = &u 296 297 case strings.HasPrefix(line, commitPrefix): 298 u, err := ParsePatchIdentity(line[len(commitPrefix):]) 299 if err != nil { 300 return nil, err 301 } 302 h.Committer = &u 303 304 case strings.HasPrefix(line, datePrefix): 305 d, err := ParsePatchDate(strings.TrimSpace(line[len(datePrefix):])) 306 if err != nil { 307 return nil, err 308 } 309 h.AuthorDate = d 310 311 case strings.HasPrefix(line, authorDatePrefix): 312 d, err := ParsePatchDate(strings.TrimSpace(line[len(authorDatePrefix):])) 313 if err != nil { 314 return nil, err 315 } 316 h.AuthorDate = d 317 318 case strings.HasPrefix(line, commitDatePrefix): 319 d, err := ParsePatchDate(strings.TrimSpace(line[len(commitDatePrefix):])) 320 if err != nil { 321 return nil, err 322 } 323 h.CommitterDate = d 324 } 325 } 326 if s.Err() != nil { 327 return nil, s.Err() 328 } 329 330 title, indent := scanMessageTitle(s) 331 if s.Err() != nil { 332 return nil, s.Err() 333 } 334 h.Title = title 335 336 if title != "" { 337 // Don't check for an appendix, pretty headers do not contain them 338 body, _ := scanMessageBody(s, indent, false) 339 if s.Err() != nil { 340 return nil, s.Err() 341 } 342 h.Body = body 343 } 344 345 return h, nil 346} 347 348func scanMessageTitle(s *bufio.Scanner) (title string, indent string) { 349 var b strings.Builder 350 for i := 0; s.Scan(); i++ { 351 line := s.Text() 352 trimLine := strings.TrimSpace(line) 353 if trimLine == "" { 354 break 355 } 356 357 if i == 0 { 358 if start := strings.IndexFunc(line, func(c rune) bool { return !unicode.IsSpace(c) }); start > 0 { 359 indent = line[:start] 360 } 361 } 362 if b.Len() > 0 { 363 b.WriteByte(' ') 364 } 365 b.WriteString(trimLine) 366 } 367 return b.String(), indent 368} 369 370func scanMessageBody(s *bufio.Scanner, indent string, separateAppendix bool) (string, string) { 371 // Body and appendix 372 var body, appendix strings.Builder 373 c := &body 374 var empty int 375 for i := 0; s.Scan(); 
i++ { 376 line := s.Text() 377 378 line = strings.TrimRightFunc(line, unicode.IsSpace) 379 line = strings.TrimPrefix(line, indent) 380 381 if line == "" { 382 empty++ 383 continue 384 } 385 386 // If requested, parse out "appendix" information (often added 387 // by `git format-patch` and removed by `git am`). 388 if separateAppendix && c == &body && line == "---" { 389 c = &appendix 390 continue 391 } 392 393 if c.Len() > 0 { 394 c.WriteByte('\n') 395 if empty > 0 { 396 c.WriteByte('\n') 397 } 398 } 399 empty = 0 400 401 c.WriteString(line) 402 } 403 return body.String(), appendix.String() 404} 405 406func parseHeaderMail(mailLine string, r io.Reader, opts patchHeaderOptions) (*PatchHeader, error) { 407 msg, err := mail.ReadMessage(r) 408 if err != nil { 409 return nil, err 410 } 411 412 h := &PatchHeader{} 413 414 if strings.HasPrefix(mailLine, mailHeaderPrefix) { 415 mailLine = strings.TrimPrefix(mailLine, mailHeaderPrefix) 416 if i := strings.IndexByte(mailLine, ' '); i > 0 { 417 h.SHA = mailLine[:i] 418 } 419 } 420 421 addrs, err := msg.Header.AddressList("From") 422 if err != nil && !errors.Is(err, mail.ErrHeaderNotPresent) { 423 return nil, err 424 } 425 if len(addrs) > 0 { 426 addr := addrs[0] 427 if addr.Name == "" { 428 addr.Name = addr.Address 429 } 430 h.Author = &PatchIdentity{Name: addr.Name, Email: addr.Address} 431 } 432 433 date := msg.Header.Get("Date") 434 if date != "" { 435 d, err := ParsePatchDate(date) 436 if err != nil { 437 return nil, err 438 } 439 h.AuthorDate = d 440 } 441 442 subject := msg.Header.Get("Subject") 443 h.SubjectPrefix, h.Title = cleanSubject(subject, opts.subjectCleanMode) 444 445 s := bufio.NewScanner(msg.Body) 446 h.Body, h.BodyAppendix = scanMessageBody(s, "", true) 447 if s.Err() != nil { 448 return nil, s.Err() 449 } 450 451 return h, nil 452} 453 454func cleanSubject(s string, mode SubjectCleanMode) (prefix string, subject string) { 455 switch mode { 456 case SubjectCleanAll, SubjectCleanPatchOnly: 457 case 
SubjectCleanWhitespace: 458 return "", strings.TrimSpace(decodeSubject(s)) 459 default: 460 panic(fmt.Sprintf("unknown clean mode: %d", mode)) 461 } 462 463 // Based on the algorithm from Git in mailinfo.c:cleanup_subject() 464 // If compatibility with `git am` drifts, go there to see if there are any updates. 465 466 at := 0 467 for at < len(s) { 468 switch s[at] { 469 case 'r', 'R': 470 // Detect re:, Re:, rE: and RE: 471 if at+2 < len(s) && (s[at+1] == 'e' || s[at+1] == 'E') && s[at+2] == ':' { 472 at += 3 473 continue 474 } 475 476 case ' ', '\t', ':': 477 // Delete whitespace and duplicate ':' characters 478 at++ 479 continue 480 481 case '[': 482 if i := strings.IndexByte(s[at:], ']'); i > 0 { 483 if mode == SubjectCleanAll || strings.Contains(s[at:at+i+1], "PATCH") { 484 at += i + 1 485 continue 486 } 487 } 488 } 489 490 // Nothing was removed, end processing 491 break 492 } 493 494 prefix = strings.TrimLeftFunc(s[:at], unicode.IsSpace) 495 subject = strings.TrimRightFunc(decodeSubject(s[at:]), unicode.IsSpace) 496 return 497} 498 499// Decodes a subject line. Currently only supports quoted-printable UTF-8. This format is the result 500// of a `git format-patch` when the commit title has a non-ASCII character (i.e. an emoji). 501// See for reference: https://stackoverflow.com/questions/27695749/gmail-api-not-respecting-utf-encoding-in-subject 502func decodeSubject(encoded string) string { 503 if !strings.HasPrefix(encoded, "=?UTF-8?q?") { 504 // not UTF-8 encoded 505 return encoded 506 } 507 508 // If the subject is too long, `git format-patch` may produce a subject line across 509 // multiple lines. 
When parsed, this can look like the following: 510 // <UTF8-prefix><first-line> <UTF8-prefix><second-line> 511 payload := " " + encoded 512 payload = strings.ReplaceAll(payload, " =?UTF-8?q?", "") 513 payload = strings.ReplaceAll(payload, "?=", "") 514 515 decoded, err := ioutil.ReadAll(quotedprintable.NewReader(strings.NewReader(payload))) 516 if err != nil { 517 // if err, abort decoding and return original subject 518 return encoded 519 } 520 521 return string(decoded) 522}