// fork of go-gitdiff with jj support
// at v0.7.4 13 kB view raw
1package gitdiff 2 3import ( 4 "bufio" 5 "errors" 6 "fmt" 7 "io" 8 "io/ioutil" 9 "mime/quotedprintable" 10 "net/mail" 11 "strconv" 12 "strings" 13 "time" 14 "unicode" 15) 16 17const ( 18 mailHeaderPrefix = "From " 19 prettyHeaderPrefix = "commit " 20 mailMinimumHeaderPrefix = "From:" 21) 22 23// PatchHeader is a parsed version of the preamble content that appears before 24// the first diff in a patch. It includes metadata about the patch, such as the 25// author and a subject. 26type PatchHeader struct { 27 // The SHA of the commit the patch was generated from. Empty if the SHA is 28 // not included in the header. 29 SHA string 30 31 // The author details of the patch. If these details are not included in 32 // the header, Author is nil and AuthorDate is the zero time. 33 Author *PatchIdentity 34 AuthorDate time.Time 35 36 // The committer details of the patch. If these details are not included in 37 // the header, Committer is nil and CommitterDate is the zero time. 38 Committer *PatchIdentity 39 CommitterDate time.Time 40 41 // The title and body of the commit message describing the changes in the 42 // patch. Empty if no message is included in the header. 43 Title string 44 Body string 45 46 // If the preamble looks like an email, ParsePatchHeader will 47 // remove prefixes such as `Re: ` and `[PATCH v3 5/17]` from the 48 // Title and place them here. 49 SubjectPrefix string 50 51 // If the preamble looks like an email, and it contains a `---` 52 // line, that line will be removed and everything after it will be 53 // placed in BodyAppendix. 54 BodyAppendix string 55} 56 57// Message returns the commit message for the header. The message consists of 58// the title and the body separated by an empty line. 
59func (h *PatchHeader) Message() string { 60 var msg strings.Builder 61 if h != nil { 62 msg.WriteString(h.Title) 63 if h.Body != "" { 64 msg.WriteString("\n\n") 65 msg.WriteString(h.Body) 66 } 67 } 68 return msg.String() 69} 70 71// ParsePatchDate parses a patch date string. It returns the parsed time or an 72// error if s has an unknown format. ParsePatchDate supports the iso, rfc, 73// short, raw, unix, and default formats (with local variants) used by the 74// --date flag in Git. 75func ParsePatchDate(s string) (time.Time, error) { 76 const ( 77 isoFormat = "2006-01-02 15:04:05 -0700" 78 isoStrictFormat = "2006-01-02T15:04:05-07:00" 79 rfc2822Format = "Mon, 2 Jan 2006 15:04:05 -0700" 80 shortFormat = "2006-01-02" 81 defaultFormat = "Mon Jan 2 15:04:05 2006 -0700" 82 defaultLocalFormat = "Mon Jan 2 15:04:05 2006" 83 ) 84 85 if s == "" { 86 return time.Time{}, nil 87 } 88 89 for _, fmt := range []string{ 90 isoFormat, 91 isoStrictFormat, 92 rfc2822Format, 93 shortFormat, 94 defaultFormat, 95 defaultLocalFormat, 96 } { 97 if t, err := time.ParseInLocation(fmt, s, time.Local); err == nil { 98 return t, nil 99 } 100 } 101 102 // unix format 103 if unix, err := strconv.ParseInt(s, 10, 64); err == nil { 104 return time.Unix(unix, 0), nil 105 } 106 107 // raw format 108 if space := strings.IndexByte(s, ' '); space > 0 { 109 unix, uerr := strconv.ParseInt(s[:space], 10, 64) 110 zone, zerr := time.Parse("-0700", s[space+1:]) 111 if uerr == nil && zerr == nil { 112 return time.Unix(unix, 0).In(zone.Location()), nil 113 } 114 } 115 116 return time.Time{}, fmt.Errorf("unknown date format: %s", s) 117} 118 119// A PatchHeaderOption modifies the behavior of ParsePatchHeader. 120type PatchHeaderOption func(*patchHeaderOptions) 121 122// SubjectCleanMode controls how ParsePatchHeader cleans subject lines when 123// parsing mail-formatted patches. 124type SubjectCleanMode int 125 126const ( 127 // SubjectCleanWhitespace removes leading and trailing whitespace. 
128 SubjectCleanWhitespace SubjectCleanMode = iota 129 130 // SubjectCleanAll removes leading and trailing whitespace, leading "Re:", 131 // "re:", and ":" strings, and leading strings enclosed by '[' and ']'. 132 // This is the default behavior of git (see `git mailinfo`) and this 133 // package. 134 SubjectCleanAll 135 136 // SubjectCleanPatchOnly is the same as SubjectCleanAll, but only removes 137 // leading strings enclosed by '[' and ']' if they start with "PATCH". 138 SubjectCleanPatchOnly 139) 140 141// WithSubjectCleanMode sets the SubjectCleanMode for header parsing. By 142// default, uses SubjectCleanAll. 143func WithSubjectCleanMode(m SubjectCleanMode) PatchHeaderOption { 144 return func(opts *patchHeaderOptions) { 145 opts.subjectCleanMode = m 146 } 147} 148 149type patchHeaderOptions struct { 150 subjectCleanMode SubjectCleanMode 151} 152 153// ParsePatchHeader parses the preamble string returned by [Parse] into a 154// PatchHeader. Due to the variety of header formats, some fields of the parsed 155// PatchHeader may be unset after parsing. 156// 157// Supported formats are the short, medium, full, fuller, and email pretty 158// formats used by `git diff`, `git log`, and `git show` and the UNIX mailbox 159// format used by `git format-patch`. 160// 161// When parsing mail-formatted headers, ParsePatchHeader tries to remove 162// email-specific content from the title and body: 163// 164// - Based on the SubjectCleanMode, remove prefixes like reply markers and 165// "[PATCH]" strings from the subject, saving any removed content in the 166// SubjectPrefix field. Parsing always discards leading and trailing 167// whitespace from the subject line. The default mode is SubjectCleanAll. 168// 169// - If the body contains a "---" line (3 hyphens), remove that line and any 170// content after it from the body and save it in the BodyAppendix field. 
171// 172// ParsePatchHeader tries to process content it does not understand wthout 173// returning errors, but will return errors if well-identified content like 174// dates or identies uses unknown or invalid formats. 175func ParsePatchHeader(header string, options ...PatchHeaderOption) (*PatchHeader, error) { 176 opts := patchHeaderOptions{ 177 subjectCleanMode: SubjectCleanAll, // match git defaults 178 } 179 for _, optFn := range options { 180 optFn(&opts) 181 } 182 183 header = strings.TrimSpace(header) 184 if header == "" { 185 return &PatchHeader{}, nil 186 } 187 188 var firstLine, rest string 189 if idx := strings.IndexByte(header, '\n'); idx >= 0 { 190 firstLine = header[:idx] 191 rest = header[idx+1:] 192 } else { 193 firstLine = header 194 rest = "" 195 } 196 197 switch { 198 case strings.HasPrefix(firstLine, mailHeaderPrefix): 199 return parseHeaderMail(firstLine, strings.NewReader(rest), opts) 200 201 case strings.HasPrefix(firstLine, mailMinimumHeaderPrefix): 202 // With a minimum header, the first line is part of the actual mail 203 // content and needs to be parsed as part of the "rest" 204 return parseHeaderMail("", strings.NewReader(header), opts) 205 206 case strings.HasPrefix(firstLine, prettyHeaderPrefix): 207 return parseHeaderPretty(firstLine, strings.NewReader(rest)) 208 } 209 210 return nil, errors.New("unrecognized patch header format") 211} 212 213func parseHeaderPretty(prettyLine string, r io.Reader) (*PatchHeader, error) { 214 const ( 215 authorPrefix = "Author:" 216 commitPrefix = "Commit:" 217 datePrefix = "Date:" 218 authorDatePrefix = "AuthorDate:" 219 commitDatePrefix = "CommitDate:" 220 ) 221 222 h := &PatchHeader{} 223 224 prettyLine = strings.TrimPrefix(prettyLine, prettyHeaderPrefix) 225 if i := strings.IndexByte(prettyLine, ' '); i > 0 { 226 h.SHA = prettyLine[:i] 227 } else { 228 h.SHA = prettyLine 229 } 230 231 s := bufio.NewScanner(r) 232 for s.Scan() { 233 line := s.Text() 234 235 // empty line marks end of fields, 
remaining lines are title/message 236 if strings.TrimSpace(line) == "" { 237 break 238 } 239 240 switch { 241 case strings.HasPrefix(line, authorPrefix): 242 u, err := ParsePatchIdentity(line[len(authorPrefix):]) 243 if err != nil { 244 return nil, err 245 } 246 h.Author = &u 247 248 case strings.HasPrefix(line, commitPrefix): 249 u, err := ParsePatchIdentity(line[len(commitPrefix):]) 250 if err != nil { 251 return nil, err 252 } 253 h.Committer = &u 254 255 case strings.HasPrefix(line, datePrefix): 256 d, err := ParsePatchDate(strings.TrimSpace(line[len(datePrefix):])) 257 if err != nil { 258 return nil, err 259 } 260 h.AuthorDate = d 261 262 case strings.HasPrefix(line, authorDatePrefix): 263 d, err := ParsePatchDate(strings.TrimSpace(line[len(authorDatePrefix):])) 264 if err != nil { 265 return nil, err 266 } 267 h.AuthorDate = d 268 269 case strings.HasPrefix(line, commitDatePrefix): 270 d, err := ParsePatchDate(strings.TrimSpace(line[len(commitDatePrefix):])) 271 if err != nil { 272 return nil, err 273 } 274 h.CommitterDate = d 275 } 276 } 277 if s.Err() != nil { 278 return nil, s.Err() 279 } 280 281 title, indent := scanMessageTitle(s) 282 if s.Err() != nil { 283 return nil, s.Err() 284 } 285 h.Title = title 286 287 if title != "" { 288 // Don't check for an appendix, pretty headers do not contain them 289 body, _ := scanMessageBody(s, indent, false) 290 if s.Err() != nil { 291 return nil, s.Err() 292 } 293 h.Body = body 294 } 295 296 return h, nil 297} 298 299func scanMessageTitle(s *bufio.Scanner) (title string, indent string) { 300 var b strings.Builder 301 for i := 0; s.Scan(); i++ { 302 line := s.Text() 303 trimLine := strings.TrimSpace(line) 304 if trimLine == "" { 305 break 306 } 307 308 if i == 0 { 309 if start := strings.IndexFunc(line, func(c rune) bool { return !unicode.IsSpace(c) }); start > 0 { 310 indent = line[:start] 311 } 312 } 313 if b.Len() > 0 { 314 b.WriteByte(' ') 315 } 316 b.WriteString(trimLine) 317 } 318 return b.String(), indent 319} 
320 321func scanMessageBody(s *bufio.Scanner, indent string, separateAppendix bool) (string, string) { 322 // Body and appendix 323 var body, appendix strings.Builder 324 c := &body 325 var empty int 326 for i := 0; s.Scan(); i++ { 327 line := s.Text() 328 329 line = strings.TrimRightFunc(line, unicode.IsSpace) 330 line = strings.TrimPrefix(line, indent) 331 332 if line == "" { 333 empty++ 334 continue 335 } 336 337 // If requested, parse out "appendix" information (often added 338 // by `git format-patch` and removed by `git am`). 339 if separateAppendix && c == &body && line == "---" { 340 c = &appendix 341 continue 342 } 343 344 if c.Len() > 0 { 345 c.WriteByte('\n') 346 if empty > 0 { 347 c.WriteByte('\n') 348 } 349 } 350 empty = 0 351 352 c.WriteString(line) 353 } 354 return body.String(), appendix.String() 355} 356 357func parseHeaderMail(mailLine string, r io.Reader, opts patchHeaderOptions) (*PatchHeader, error) { 358 msg, err := mail.ReadMessage(r) 359 if err != nil { 360 return nil, err 361 } 362 363 h := &PatchHeader{} 364 365 if strings.HasPrefix(mailLine, mailHeaderPrefix) { 366 mailLine = strings.TrimPrefix(mailLine, mailHeaderPrefix) 367 if i := strings.IndexByte(mailLine, ' '); i > 0 { 368 h.SHA = mailLine[:i] 369 } 370 } 371 372 from := msg.Header.Get("From") 373 if from != "" { 374 u, err := ParsePatchIdentity(from) 375 if err != nil { 376 return nil, err 377 } 378 h.Author = &u 379 } 380 381 date := msg.Header.Get("Date") 382 if date != "" { 383 d, err := ParsePatchDate(date) 384 if err != nil { 385 return nil, err 386 } 387 h.AuthorDate = d 388 } 389 390 subject := msg.Header.Get("Subject") 391 h.SubjectPrefix, h.Title = cleanSubject(subject, opts.subjectCleanMode) 392 393 s := bufio.NewScanner(msg.Body) 394 h.Body, h.BodyAppendix = scanMessageBody(s, "", true) 395 if s.Err() != nil { 396 return nil, s.Err() 397 } 398 399 return h, nil 400} 401 402func cleanSubject(s string, mode SubjectCleanMode) (prefix string, subject string) { 403 switch 
mode { 404 case SubjectCleanAll, SubjectCleanPatchOnly: 405 case SubjectCleanWhitespace: 406 return "", strings.TrimSpace(decodeSubject(s)) 407 default: 408 panic(fmt.Sprintf("unknown clean mode: %d", mode)) 409 } 410 411 // Based on the algorithm from Git in mailinfo.c:cleanup_subject() 412 // If compatibility with `git am` drifts, go there to see if there are any updates. 413 414 at := 0 415 for at < len(s) { 416 switch s[at] { 417 case 'r', 'R': 418 // Detect re:, Re:, rE: and RE: 419 if at+2 < len(s) && (s[at+1] == 'e' || s[at+1] == 'E') && s[at+2] == ':' { 420 at += 3 421 continue 422 } 423 424 case ' ', '\t', ':': 425 // Delete whitespace and duplicate ':' characters 426 at++ 427 continue 428 429 case '[': 430 if i := strings.IndexByte(s[at:], ']'); i > 0 { 431 if mode == SubjectCleanAll || strings.Contains(s[at:at+i+1], "PATCH") { 432 at += i + 1 433 continue 434 } 435 } 436 } 437 438 // Nothing was removed, end processing 439 break 440 } 441 442 prefix = strings.TrimLeftFunc(s[:at], unicode.IsSpace) 443 subject = strings.TrimRightFunc(decodeSubject(s[at:]), unicode.IsSpace) 444 return 445} 446 447// Decodes a subject line. Currently only supports quoted-printable UTF-8. This format is the result 448// of a `git format-patch` when the commit title has a non-ASCII character (i.e. an emoji). 449// See for reference: https://stackoverflow.com/questions/27695749/gmail-api-not-respecting-utf-encoding-in-subject 450func decodeSubject(encoded string) string { 451 if !strings.HasPrefix(encoded, "=?UTF-8?q?") { 452 // not UTF-8 encoded 453 return encoded 454 } 455 456 // If the subject is too long, `git format-patch` may produce a subject line across 457 // multiple lines. 
When parsed, this can look like the following: 458 // <UTF8-prefix><first-line> <UTF8-prefix><second-line> 459 payload := " " + encoded 460 payload = strings.ReplaceAll(payload, " =?UTF-8?q?", "") 461 payload = strings.ReplaceAll(payload, "?=", "") 462 463 decoded, err := ioutil.ReadAll(quotedprintable.NewReader(strings.NewReader(payload))) 464 if err != nil { 465 // if err, abort decoding and return original subject 466 return encoded 467 } 468 469 return string(decoded) 470}