fork of go-gitdiff with jj support
1package gitdiff
2
3import (
4 "bufio"
5 "errors"
6 "fmt"
7 "io"
8 "io/ioutil"
9 "mime/quotedprintable"
10 "net/mail"
11 "strconv"
12 "strings"
13 "time"
14 "unicode"
15)
16
// Prefixes of the first preamble line. ParsePatchHeader compares the first
// line against them to choose which header parser to run.
const (
	// prettyHeaderPrefix starts the first line of a pretty-formatted header;
	// the text that follows it on that line is read as the commit SHA.
	prettyHeaderPrefix = "commit "

	// mailHeaderPrefix starts the first line of a mailbox-formatted header.
	// Note the trailing space: the text that follows is read as the SHA.
	mailHeaderPrefix = "From "

	// mailMinimumHeaderPrefix starts a mail header that has no mailbox line,
	// so the first line is already an ordinary "From:" mail header field.
	mailMinimumHeaderPrefix = "From:"
)
22
23// PatchHeader is a parsed version of the preamble content that appears before
24// the first diff in a patch. It includes metadata about the patch, such as the
25// author and a subject.
26type PatchHeader struct {
27 // The SHA of the commit the patch was generated from. Empty if the SHA is
28 // not included in the header.
29 SHA string
30
31 // The author details of the patch. If these details are not included in
32 // the header, Author is nil and AuthorDate is the zero time.
33 Author *PatchIdentity
34 AuthorDate time.Time
35
36 // The committer details of the patch. If these details are not included in
37 // the header, Committer is nil and CommitterDate is the zero time.
38 Committer *PatchIdentity
39 CommitterDate time.Time
40
41 // The title and body of the commit message describing the changes in the
42 // patch. Empty if no message is included in the header.
43 Title string
44 Body string
45
46 // If the preamble looks like an email, ParsePatchHeader will
47 // remove prefixes such as `Re: ` and `[PATCH v3 5/17]` from the
48 // Title and place them here.
49 SubjectPrefix string
50
51 // If the preamble looks like an email, and it contains a `---`
52 // line, that line will be removed and everything after it will be
53 // placed in BodyAppendix.
54 BodyAppendix string
55}
56
57// Message returns the commit message for the header. The message consists of
58// the title and the body separated by an empty line.
59func (h *PatchHeader) Message() string {
60 var msg strings.Builder
61 if h != nil {
62 msg.WriteString(h.Title)
63 if h.Body != "" {
64 msg.WriteString("\n\n")
65 msg.WriteString(h.Body)
66 }
67 }
68 return msg.String()
69}
70
// PatchIdentity identifies a person who authored or committed a patch.
type PatchIdentity struct {
	Name  string
	Email string
}

// String formats the identity as "Name <Email>". An empty name is rendered
// as a pair of double quotes.
func (i PatchIdentity) String() string {
	if i.Name == "" {
		return fmt.Sprintf("%s <%s>", `""`, i.Email)
	}
	return fmt.Sprintf("%s <%s>", i.Name, i.Email)
}

// ParsePatchIdentity parses a patch identity string of the form
// "Name <email>". The name is optional, but the angle brackets are required,
// although they may enclose an empty address. At least one of the name and
// the address must be non-empty; when only an address is given it is used as
// the name too.
//
// The name ends at the first '<' and the address ends at the first '>' that
// follows it, so the name cannot contain '<' and the address cannot contain
// '>'. Surrounding whitespace is trimmed from both fields; otherwise their
// content is not restricted.
func ParsePatchIdentity(s string) (PatchIdentity, error) {
	open := strings.IndexByte(s, '<')
	if open < 0 {
		// No email section at all, so there is no name and no address.
		return PatchIdentity{}, fmt.Errorf("invalid identity string: %s", s)
	}

	// width is the length of the text between '<' and the closing '>'.
	width := strings.IndexByte(s[open+1:], '>')
	if width < 0 {
		return PatchIdentity{}, fmt.Errorf("invalid identity string: unclosed email section: %s", s)
	}

	name := strings.TrimSpace(s[:open])
	email := strings.TrimSpace(s[open+1 : open+1+width])

	switch {
	case name == "" && email == "":
		return PatchIdentity{}, fmt.Errorf("invalid identity string: %s", s)
	case name == "":
		name = email
	}
	return PatchIdentity{Name: name, Email: email}, nil
}
126
// ParsePatchDate parses a patch date string. It returns the parsed time or an
// error if s has an unknown format. ParsePatchDate supports the iso, rfc,
// short, raw, unix, and default formats (with local variants) used by the
// --date flag in Git.
//
// An empty string is not an error: it yields the zero time. Formats without
// an explicit zone offset are interpreted in the local time zone. For the raw
// format ("<unix seconds> <offset>") the returned time always reports the
// offset written in s.
func ParsePatchDate(s string) (time.Time, error) {
	const (
		isoFormat          = "2006-01-02 15:04:05 -0700"
		isoStrictFormat    = "2006-01-02T15:04:05-07:00"
		rfc2822Format      = "Mon, 2 Jan 2006 15:04:05 -0700"
		shortFormat        = "2006-01-02"
		defaultFormat      = "Mon Jan 2 15:04:05 2006 -0700"
		defaultLocalFormat = "Mon Jan 2 15:04:05 2006"
	)

	if s == "" {
		return time.Time{}, nil
	}

	// The loop variable is deliberately not called fmt, so the fmt package
	// stays visible throughout the function.
	for _, layout := range []string{
		isoFormat,
		isoStrictFormat,
		rfc2822Format,
		shortFormat,
		defaultFormat,
		defaultLocalFormat,
	} {
		if t, err := time.ParseInLocation(layout, s, time.Local); err == nil {
			return t, nil
		}
	}

	// unix format: seconds since the epoch
	if unix, err := strconv.ParseInt(s, 10, 64); err == nil {
		return time.Unix(unix, 0), nil
	}

	// raw format: seconds since the epoch followed by a zone offset
	if space := strings.IndexByte(s, ' '); space > 0 {
		unix, uerr := strconv.ParseInt(s[:space], 10, 64)
		zone, zerr := time.Parse("-0700", s[space+1:])
		if uerr == nil && zerr == nil {
			// time.Parse may attach time.Local to the parsed value when the
			// local zone uses this offset at the parsed instant. Converting
			// into that location could then apply a different offset (for
			// example a DST shift) at the real timestamp, so build a fixed
			// zone from the numeric offset instead.
			_, offset := zone.Zone()
			return time.Unix(unix, 0).In(time.FixedZone("", offset)), nil
		}
	}

	return time.Time{}, fmt.Errorf("unknown date format: %s", s)
}
174
// patchHeaderOptions collects the settings that PatchHeaderOption values can
// change.
type patchHeaderOptions struct {
	subjectCleanMode SubjectCleanMode
}

// A PatchHeaderOption modifies the behavior of ParsePatchHeader.
type PatchHeaderOption func(*patchHeaderOptions)

// SubjectCleanMode controls how ParsePatchHeader cleans subject lines when
// parsing mail-formatted patches.
type SubjectCleanMode int

const (
	// SubjectCleanWhitespace only strips leading and trailing whitespace.
	SubjectCleanWhitespace SubjectCleanMode = iota

	// SubjectCleanAll strips leading and trailing whitespace, leading reply
	// markers such as "Re:" and "re:", leading ":" characters, and leading
	// strings enclosed by '[' and ']'. This is the default behavior of git
	// (see `git mailinfo`) and of this package.
	SubjectCleanAll

	// SubjectCleanPatchOnly behaves like SubjectCleanAll, except that a
	// leading string enclosed by '[' and ']' is only stripped if it
	// contains "PATCH".
	SubjectCleanPatchOnly
)

// WithSubjectCleanMode returns an option that sets the SubjectCleanMode used
// for header parsing. Without this option, parsing uses SubjectCleanAll.
func WithSubjectCleanMode(m SubjectCleanMode) PatchHeaderOption {
	return func(o *patchHeaderOptions) { o.subjectCleanMode = m }
}
208
209// ParsePatchHeader parses the preamble string returned by [Parse] into a
210// PatchHeader. Due to the variety of header formats, some fields of the parsed
211// PatchHeader may be unset after parsing.
212//
213// Supported formats are the short, medium, full, fuller, and email pretty
214// formats used by `git diff`, `git log`, and `git show` and the UNIX mailbox
215// format used by `git format-patch`.
216//
217// When parsing mail-formatted headers, ParsePatchHeader tries to remove
218// email-specific content from the title and body:
219//
220// - Based on the SubjectCleanMode, remove prefixes like reply markers and
221// "[PATCH]" strings from the subject, saving any removed content in the
222// SubjectPrefix field. Parsing always discards leading and trailing
223// whitespace from the subject line. The default mode is SubjectCleanAll.
224//
225// - If the body contains a "---" line (3 hyphens), remove that line and any
226// content after it from the body and save it in the BodyAppendix field.
227//
228// ParsePatchHeader tries to process content it does not understand wthout
229// returning errors, but will return errors if well-identified content like
230// dates or identies uses unknown or invalid formats.
231func ParsePatchHeader(header string, options ...PatchHeaderOption) (*PatchHeader, error) {
232 opts := patchHeaderOptions{
233 subjectCleanMode: SubjectCleanAll, // match git defaults
234 }
235 for _, optFn := range options {
236 optFn(&opts)
237 }
238
239 header = strings.TrimSpace(header)
240 if header == "" {
241 return &PatchHeader{}, nil
242 }
243
244 var firstLine, rest string
245 if idx := strings.IndexByte(header, '\n'); idx >= 0 {
246 firstLine = header[:idx]
247 rest = header[idx+1:]
248 } else {
249 firstLine = header
250 rest = ""
251 }
252
253 switch {
254 case strings.HasPrefix(firstLine, mailHeaderPrefix):
255 return parseHeaderMail(firstLine, strings.NewReader(rest), opts)
256
257 case strings.HasPrefix(firstLine, mailMinimumHeaderPrefix):
258 // With a minimum header, the first line is part of the actual mail
259 // content and needs to be parsed as part of the "rest"
260 return parseHeaderMail("", strings.NewReader(header), opts)
261
262 case strings.HasPrefix(firstLine, prettyHeaderPrefix):
263 return parseHeaderPretty(firstLine, strings.NewReader(rest))
264 }
265
266 return nil, errors.New("unrecognized patch header format")
267}
268
269func parseHeaderPretty(prettyLine string, r io.Reader) (*PatchHeader, error) {
270 const (
271 authorPrefix = "Author:"
272 commitPrefix = "Commit:"
273 datePrefix = "Date:"
274 authorDatePrefix = "AuthorDate:"
275 commitDatePrefix = "CommitDate:"
276 )
277
278 h := &PatchHeader{}
279
280 prettyLine = strings.TrimPrefix(prettyLine, prettyHeaderPrefix)
281 if i := strings.IndexByte(prettyLine, ' '); i > 0 {
282 h.SHA = prettyLine[:i]
283 } else {
284 h.SHA = prettyLine
285 }
286
287 s := bufio.NewScanner(r)
288 for s.Scan() {
289 line := s.Text()
290
291 // empty line marks end of fields, remaining lines are title/message
292 if strings.TrimSpace(line) == "" {
293 break
294 }
295
296 switch {
297 case strings.HasPrefix(line, authorPrefix):
298 u, err := ParsePatchIdentity(line[len(authorPrefix):])
299 if err != nil {
300 return nil, err
301 }
302 h.Author = &u
303
304 case strings.HasPrefix(line, commitPrefix):
305 u, err := ParsePatchIdentity(line[len(commitPrefix):])
306 if err != nil {
307 return nil, err
308 }
309 h.Committer = &u
310
311 case strings.HasPrefix(line, datePrefix):
312 d, err := ParsePatchDate(strings.TrimSpace(line[len(datePrefix):]))
313 if err != nil {
314 return nil, err
315 }
316 h.AuthorDate = d
317
318 case strings.HasPrefix(line, authorDatePrefix):
319 d, err := ParsePatchDate(strings.TrimSpace(line[len(authorDatePrefix):]))
320 if err != nil {
321 return nil, err
322 }
323 h.AuthorDate = d
324
325 case strings.HasPrefix(line, commitDatePrefix):
326 d, err := ParsePatchDate(strings.TrimSpace(line[len(commitDatePrefix):]))
327 if err != nil {
328 return nil, err
329 }
330 h.CommitterDate = d
331 }
332 }
333 if s.Err() != nil {
334 return nil, s.Err()
335 }
336
337 title, indent := scanMessageTitle(s)
338 if s.Err() != nil {
339 return nil, s.Err()
340 }
341 h.Title = title
342
343 if title != "" {
344 // Don't check for an appendix, pretty headers do not contain them
345 body, _ := scanMessageBody(s, indent, false)
346 if s.Err() != nil {
347 return nil, s.Err()
348 }
349 h.Body = body
350 }
351
352 return h, nil
353}
354
// scanMessageTitle reads the message title from s. The title is made of the
// lines up to the first blank line (or the end of input), each trimmed of
// surrounding whitespace and joined with single spaces. The terminating
// blank line is consumed.
//
// indent is the leading whitespace of the first title line, or empty if that
// line is not indented. Callers must check s.Err() for read errors.
func scanMessageTitle(s *bufio.Scanner) (title string, indent string) {
	var parts []string
	for s.Scan() {
		raw := s.Text()
		text := strings.TrimSpace(raw)
		if text == "" {
			break
		}

		if len(parts) == 0 {
			// Only the first title line determines the indent.
			indent = raw[:len(raw)-len(strings.TrimLeftFunc(raw, unicode.IsSpace))]
		}
		parts = append(parts, text)
	}
	return strings.Join(parts, " "), indent
}
376
// scanMessageBody reads the remaining lines of s and returns the message
// body and, optionally, its appendix.
//
// Trailing whitespace and then the indent prefix are removed from every
// line. Blank lines before the first line of text are dropped, and any run
// of blank lines between two lines of text becomes a single empty line.
//
// If separateAppendix is true, the first "---" line ends the body: the line
// itself is dropped and the lines after it form the appendix, which is often
// added by `git format-patch` and removed by `git am`. Otherwise the
// appendix is empty. Callers must check s.Err() for read errors.
func scanMessageBody(s *bufio.Scanner, indent string, separateAppendix bool) (string, string) {
	var body, appendix strings.Builder

	out := &body          // section currently being written
	pendingBlank := false // a blank line was seen since the last written line

	for s.Scan() {
		line := strings.TrimPrefix(strings.TrimRightFunc(s.Text(), unicode.IsSpace), indent)

		switch {
		case line == "":
			pendingBlank = true

		case separateAppendix && out == &body && line == "---":
			out = &appendix

		default:
			if out.Len() > 0 {
				if pendingBlank {
					out.WriteString("\n\n")
				} else {
					out.WriteByte('\n')
				}
			}
			pendingBlank = false
			out.WriteString(line)
		}
	}
	return body.String(), appendix.String()
}
412
413func parseHeaderMail(mailLine string, r io.Reader, opts patchHeaderOptions) (*PatchHeader, error) {
414 msg, err := mail.ReadMessage(r)
415 if err != nil {
416 return nil, err
417 }
418
419 h := &PatchHeader{}
420
421 if strings.HasPrefix(mailLine, mailHeaderPrefix) {
422 mailLine = strings.TrimPrefix(mailLine, mailHeaderPrefix)
423 if i := strings.IndexByte(mailLine, ' '); i > 0 {
424 h.SHA = mailLine[:i]
425 }
426 }
427
428 addrs, err := msg.Header.AddressList("From")
429 if err != nil && !errors.Is(err, mail.ErrHeaderNotPresent) {
430 return nil, err
431 }
432 if len(addrs) > 0 {
433 addr := addrs[0]
434 if addr.Name == "" {
435 addr.Name = addr.Address
436 }
437 h.Author = &PatchIdentity{Name: addr.Name, Email: addr.Address}
438 }
439
440 date := msg.Header.Get("Date")
441 if date != "" {
442 d, err := ParsePatchDate(date)
443 if err != nil {
444 return nil, err
445 }
446 h.AuthorDate = d
447 }
448
449 subject := msg.Header.Get("Subject")
450 h.SubjectPrefix, h.Title = cleanSubject(subject, opts.subjectCleanMode)
451
452 s := bufio.NewScanner(msg.Body)
453 h.Body, h.BodyAppendix = scanMessageBody(s, "", true)
454 if s.Err() != nil {
455 return nil, s.Err()
456 }
457
458 return h, nil
459}
460
461func cleanSubject(s string, mode SubjectCleanMode) (prefix string, subject string) {
462 switch mode {
463 case SubjectCleanAll, SubjectCleanPatchOnly:
464 case SubjectCleanWhitespace:
465 return "", strings.TrimSpace(decodeSubject(s))
466 default:
467 panic(fmt.Sprintf("unknown clean mode: %d", mode))
468 }
469
470 // Based on the algorithm from Git in mailinfo.c:cleanup_subject()
471 // If compatibility with `git am` drifts, go there to see if there are any updates.
472
473 at := 0
474 for at < len(s) {
475 switch s[at] {
476 case 'r', 'R':
477 // Detect re:, Re:, rE: and RE:
478 if at+2 < len(s) && (s[at+1] == 'e' || s[at+1] == 'E') && s[at+2] == ':' {
479 at += 3
480 continue
481 }
482
483 case ' ', '\t', ':':
484 // Delete whitespace and duplicate ':' characters
485 at++
486 continue
487
488 case '[':
489 if i := strings.IndexByte(s[at:], ']'); i > 0 {
490 if mode == SubjectCleanAll || strings.Contains(s[at:at+i+1], "PATCH") {
491 at += i + 1
492 continue
493 }
494 }
495 }
496
497 // Nothing was removed, end processing
498 break
499 }
500
501 prefix = strings.TrimLeftFunc(s[:at], unicode.IsSpace)
502 subject = strings.TrimRightFunc(decodeSubject(s[at:]), unicode.IsSpace)
503 return
504}
505
// decodeSubject decodes an encoded subject line. Only quoted-printable UTF-8
// is supported, which is what `git format-patch` produces when the commit
// title has a non-ASCII character (i.e. an emoji). A subject that does not
// start with the "=?UTF-8?q?" marker, or that fails to decode, is returned
// unchanged.
// See for reference: https://stackoverflow.com/questions/27695749/gmail-api-not-respecting-utf-encoding-in-subject
func decodeSubject(encoded string) string {
	const wordStart = "=?UTF-8?q?"

	if !strings.HasPrefix(encoded, wordStart) {
		// not UTF-8 encoded
		return encoded
	}

	// `git format-patch` may split a long subject across several lines. Once
	// parsed, this looks like
	//   <UTF8-prefix><first-line> <UTF8-prefix><second-line>
	// Prepending a space makes the first word look like the others, so one
	// replacement removes every start marker together with its separator.
	payload := strings.ReplaceAll(" "+encoded, " "+wordStart, "")
	payload = strings.ReplaceAll(payload, "?=", "")

	raw, err := ioutil.ReadAll(quotedprintable.NewReader(strings.NewReader(payload)))
	if err == nil {
		return string(raw)
	}
	// decoding failed, fall back to the original subject
	return encoded
}