// Fork of go-gitdiff with jj support.
1package gitdiff
2
3import (
4 "bufio"
5 "errors"
6 "fmt"
7 "io"
8 "io/ioutil"
9 "mime/quotedprintable"
10 "net/mail"
11 "strconv"
12 "strings"
13 "time"
14 "unicode"
15)
16
// Prefixes that ParsePatchHeader matches against the first line of a header
// to decide which parser handles it.
const (
	// mailHeaderPrefix starts the first line of a mail-formatted header.
	// parseHeaderMail reads the text that follows it, up to the first
	// space, as the SHA.
	mailHeaderPrefix = "From "
	// prettyHeaderPrefix starts the first line of a pretty-formatted
	// header; parseHeaderPretty reads the SHA from the text that follows.
	prettyHeaderPrefix = "commit "
	// mailMinimumHeaderPrefix starts a header whose first line is already
	// part of the mail content, so the whole header is parsed as the mail.
	mailMinimumHeaderPrefix = "From:"
)
22
23// PatchHeader is a parsed version of the preamble content that appears before
24// the first diff in a patch. It includes metadata about the patch, such as the
25// author and a subject.
26type PatchHeader struct {
27 // The SHA of the commit the patch was generated from. Empty if the SHA is
28 // not included in the header.
29 SHA string
30
31 // The author details of the patch. If these details are not included in
32 // the header, Author is nil and AuthorDate is the zero time.
33 Author *PatchIdentity
34 AuthorDate time.Time
35
36 // The committer details of the patch. If these details are not included in
37 // the header, Committer is nil and CommitterDate is the zero time.
38 Committer *PatchIdentity
39 CommitterDate time.Time
40
41 // The title and body of the commit message describing the changes in the
42 // patch. Empty if no message is included in the header.
43 Title string
44 Body string
45
46 // If the preamble looks like an email, ParsePatchHeader will
47 // remove prefixes such as `Re: ` and `[PATCH v3 5/17]` from the
48 // Title and place them here.
49 SubjectPrefix string
50
51 // If the preamble looks like an email, and it contains a `---`
52 // line, that line will be removed and everything after it will be
53 // placed in BodyAppendix.
54 BodyAppendix string
55}
56
57// Message returns the commit message for the header. The message consists of
58// the title and the body separated by an empty line.
59func (h *PatchHeader) Message() string {
60 var msg strings.Builder
61 if h != nil {
62 msg.WriteString(h.Title)
63 if h.Body != "" {
64 msg.WriteString("\n\n")
65 msg.WriteString(h.Body)
66 }
67 }
68 return msg.String()
69}
70
// PatchIdentity names a person who authored or committed a patch.
type PatchIdentity struct {
	Name  string
	Email string
}

// String formats the identity as `Name <Email>`. An empty name is rendered
// as a pair of double quotes so the output always has a name part.
func (i PatchIdentity) String() string {
	if i.Name != "" {
		return fmt.Sprintf("%s <%s>", i.Name, i.Email)
	}
	return fmt.Sprintf("%s <%s>", `""`, i.Email)
}
84
85// ParsePatchIdentity parses a patch identity string. A valid string contains a
86// non-empty name followed by an email address in angle brackets. Like Git,
87// ParsePatchIdentity does not require that the email address is valid or
88// properly formatted, only that it is non-empty. The name must not contain a
89// left angle bracket, '<', and the email address must not contain a right
90// angle bracket, '>'.
91func ParsePatchIdentity(s string) (PatchIdentity, error) {
92 var emailStart, emailEnd int
93 for i, c := range s {
94 if c == '<' && emailStart == 0 {
95 emailStart = i + 1
96 }
97 if c == '>' && emailStart > 0 {
98 emailEnd = i
99 break
100 }
101 }
102 if emailStart > 0 && emailEnd == 0 {
103 return PatchIdentity{}, fmt.Errorf("invalid identity string: unclosed email section: %s", s)
104 }
105
106 var name, email string
107 if emailStart > 0 {
108 name = strings.TrimSpace(s[:emailStart-1])
109 }
110 if emailStart > 0 && emailEnd > 0 {
111 email = strings.TrimSpace(s[emailStart:emailEnd])
112 }
113 if name == "" || email == "" {
114 return PatchIdentity{}, fmt.Errorf("invalid identity string: %s", s)
115 }
116
117 return PatchIdentity{Name: name, Email: email}, nil
118}
119
// ParsePatchDate parses a patch date string. It returns the parsed time or an
// error if s has an unknown format. ParsePatchDate supports the iso, rfc,
// short, raw, unix, and default formats (with local variants) used by the
// --date flag in Git. The strict ISO format accepts both a numeric offset
// ("+00:00") and the "Z" designator for UTC.
//
// An empty string is not an error: it yields the zero time. Formats without
// an explicit zone are interpreted in the local time zone.
func ParsePatchDate(s string) (time.Time, error) {
	const (
		isoFormat = "2006-01-02 15:04:05 -0700"
		// "Z07:00" matches either a literal "Z" or a "+hh:mm" offset, so
		// strict ISO 8601 dates in UTC written with "Z" parse as well.
		isoStrictFormat    = "2006-01-02T15:04:05Z07:00"
		rfc2822Format      = "Mon, 2 Jan 2006 15:04:05 -0700"
		shortFormat        = "2006-01-02"
		defaultFormat      = "Mon Jan 2 15:04:05 2006 -0700"
		defaultLocalFormat = "Mon Jan 2 15:04:05 2006"
	)

	if s == "" {
		return time.Time{}, nil
	}

	// The loop variable is named layout so it does not shadow the fmt package.
	for _, layout := range []string{
		isoFormat,
		isoStrictFormat,
		rfc2822Format,
		shortFormat,
		defaultFormat,
		defaultLocalFormat,
	} {
		if t, err := time.ParseInLocation(layout, s, time.Local); err == nil {
			return t, nil
		}
	}

	// unix format: seconds since the epoch
	if unix, err := strconv.ParseInt(s, 10, 64); err == nil {
		return time.Unix(unix, 0), nil
	}

	// raw format: seconds since the epoch followed by a zone offset
	if space := strings.IndexByte(s, ' '); space > 0 {
		unix, uerr := strconv.ParseInt(s[:space], 10, 64)
		zone, zerr := time.Parse("-0700", s[space+1:])
		if uerr == nil && zerr == nil {
			return time.Unix(unix, 0).In(zone.Location()), nil
		}
	}

	return time.Time{}, fmt.Errorf("unknown date format: %s", s)
}
167
// A PatchHeaderOption adjusts how ParsePatchHeader parses a header.
type PatchHeaderOption func(*patchHeaderOptions)

// SubjectCleanMode selects how much of the subject line ParsePatchHeader
// strips when it parses a mail-formatted patch.
type SubjectCleanMode int

const (
	// SubjectCleanWhitespace strips only leading and trailing whitespace.
	SubjectCleanWhitespace SubjectCleanMode = iota

	// SubjectCleanAll strips leading and trailing whitespace, leading
	// "Re:", "re:", and ":" strings, and leading strings enclosed by '['
	// and ']'. This matches the default of git (see `git mailinfo`) and is
	// the default of this package.
	SubjectCleanAll

	// SubjectCleanPatchOnly behaves like SubjectCleanAll, except that a
	// leading string enclosed by '[' and ']' is only stripped when it
	// contains "PATCH".
	SubjectCleanPatchOnly
)

// WithSubjectCleanMode returns an option that makes header parsing use the
// SubjectCleanMode m. Without this option, parsing uses SubjectCleanAll.
func WithSubjectCleanMode(m SubjectCleanMode) PatchHeaderOption {
	apply := func(target *patchHeaderOptions) {
		target.subjectCleanMode = m
	}
	return apply
}

// patchHeaderOptions collects the settings that PatchHeaderOption values
// modify.
type patchHeaderOptions struct {
	subjectCleanMode SubjectCleanMode
}
201
202// ParsePatchHeader parses the preamble string returned by [Parse] into a
203// PatchHeader. Due to the variety of header formats, some fields of the parsed
204// PatchHeader may be unset after parsing.
205//
206// Supported formats are the short, medium, full, fuller, and email pretty
207// formats used by `git diff`, `git log`, and `git show` and the UNIX mailbox
208// format used by `git format-patch`.
209//
210// When parsing mail-formatted headers, ParsePatchHeader tries to remove
211// email-specific content from the title and body:
212//
213// - Based on the SubjectCleanMode, remove prefixes like reply markers and
214// "[PATCH]" strings from the subject, saving any removed content in the
215// SubjectPrefix field. Parsing always discards leading and trailing
216// whitespace from the subject line. The default mode is SubjectCleanAll.
217//
218// - If the body contains a "---" line (3 hyphens), remove that line and any
219// content after it from the body and save it in the BodyAppendix field.
220//
221// ParsePatchHeader tries to process content it does not understand wthout
222// returning errors, but will return errors if well-identified content like
223// dates or identies uses unknown or invalid formats.
224func ParsePatchHeader(header string, options ...PatchHeaderOption) (*PatchHeader, error) {
225 opts := patchHeaderOptions{
226 subjectCleanMode: SubjectCleanAll, // match git defaults
227 }
228 for _, optFn := range options {
229 optFn(&opts)
230 }
231
232 header = strings.TrimSpace(header)
233 if header == "" {
234 return &PatchHeader{}, nil
235 }
236
237 var firstLine, rest string
238 if idx := strings.IndexByte(header, '\n'); idx >= 0 {
239 firstLine = header[:idx]
240 rest = header[idx+1:]
241 } else {
242 firstLine = header
243 rest = ""
244 }
245
246 switch {
247 case strings.HasPrefix(firstLine, mailHeaderPrefix):
248 return parseHeaderMail(firstLine, strings.NewReader(rest), opts)
249
250 case strings.HasPrefix(firstLine, mailMinimumHeaderPrefix):
251 // With a minimum header, the first line is part of the actual mail
252 // content and needs to be parsed as part of the "rest"
253 return parseHeaderMail("", strings.NewReader(header), opts)
254
255 case strings.HasPrefix(firstLine, prettyHeaderPrefix):
256 return parseHeaderPretty(firstLine, strings.NewReader(rest))
257 }
258
259 return nil, errors.New("unrecognized patch header format")
260}
261
262func parseHeaderPretty(prettyLine string, r io.Reader) (*PatchHeader, error) {
263 const (
264 authorPrefix = "Author:"
265 commitPrefix = "Commit:"
266 datePrefix = "Date:"
267 authorDatePrefix = "AuthorDate:"
268 commitDatePrefix = "CommitDate:"
269 )
270
271 h := &PatchHeader{}
272
273 prettyLine = strings.TrimPrefix(prettyLine, prettyHeaderPrefix)
274 if i := strings.IndexByte(prettyLine, ' '); i > 0 {
275 h.SHA = prettyLine[:i]
276 } else {
277 h.SHA = prettyLine
278 }
279
280 s := bufio.NewScanner(r)
281 for s.Scan() {
282 line := s.Text()
283
284 // empty line marks end of fields, remaining lines are title/message
285 if strings.TrimSpace(line) == "" {
286 break
287 }
288
289 switch {
290 case strings.HasPrefix(line, authorPrefix):
291 u, err := ParsePatchIdentity(line[len(authorPrefix):])
292 if err != nil {
293 return nil, err
294 }
295 h.Author = &u
296
297 case strings.HasPrefix(line, commitPrefix):
298 u, err := ParsePatchIdentity(line[len(commitPrefix):])
299 if err != nil {
300 return nil, err
301 }
302 h.Committer = &u
303
304 case strings.HasPrefix(line, datePrefix):
305 d, err := ParsePatchDate(strings.TrimSpace(line[len(datePrefix):]))
306 if err != nil {
307 return nil, err
308 }
309 h.AuthorDate = d
310
311 case strings.HasPrefix(line, authorDatePrefix):
312 d, err := ParsePatchDate(strings.TrimSpace(line[len(authorDatePrefix):]))
313 if err != nil {
314 return nil, err
315 }
316 h.AuthorDate = d
317
318 case strings.HasPrefix(line, commitDatePrefix):
319 d, err := ParsePatchDate(strings.TrimSpace(line[len(commitDatePrefix):]))
320 if err != nil {
321 return nil, err
322 }
323 h.CommitterDate = d
324 }
325 }
326 if s.Err() != nil {
327 return nil, s.Err()
328 }
329
330 title, indent := scanMessageTitle(s)
331 if s.Err() != nil {
332 return nil, s.Err()
333 }
334 h.Title = title
335
336 if title != "" {
337 // Don't check for an appendix, pretty headers do not contain them
338 body, _ := scanMessageBody(s, indent, false)
339 if s.Err() != nil {
340 return nil, s.Err()
341 }
342 h.Body = body
343 }
344
345 return h, nil
346}
347
// scanMessageTitle reads lines from s until a blank line or the end of the
// input and joins them, trimmed, with single spaces to form the title. The
// blank line that ends the title is consumed.
//
// indent is the leading whitespace of the first title line; it is empty when
// that line is not indented or when there is no title.
func scanMessageTitle(s *bufio.Scanner) (title string, indent string) {
	var parts []string
	for s.Scan() {
		raw := s.Text()
		text := strings.TrimSpace(raw)
		if text == "" {
			break
		}

		if len(parts) == 0 {
			// Only the first line determines the indent. Whatever
			// TrimLeftFunc removes is exactly the leading whitespace.
			indent = raw[:len(raw)-len(strings.TrimLeftFunc(raw, unicode.IsSpace))]
		}
		parts = append(parts, text)
	}
	return strings.Join(parts, " "), indent
}
369
// scanMessageBody reads the rest of s and returns the message body and the
// appendix. Each line has trailing whitespace and then the indent prefix
// removed. Blank lines at the start and the end are dropped and any run of
// blank lines between two text lines becomes a single blank line.
//
// When separateAppendix is true, the first line that equals "---" is dropped
// and every later line goes to the appendix instead of the body (this
// information is often added by `git format-patch` and removed by `git am`).
// When it is false, the returned appendix is always empty.
func scanMessageBody(s *bufio.Scanner, indent string, separateAppendix bool) (string, string) {
	var body, appendix strings.Builder

	out := &body         // builder that receives the current line
	inAppendix := false  // whether the "---" separator was already seen
	sawBlank := false    // whether a blank line preceded the current line
	for s.Scan() {
		text := strings.TrimPrefix(strings.TrimRightFunc(s.Text(), unicode.IsSpace), indent)

		switch {
		case text == "":
			sawBlank = true

		case separateAppendix && !inAppendix && text == "---":
			inAppendix = true
			out = &appendix

		default:
			if out.Len() > 0 {
				// Separate from the previous line, keeping one blank
				// line if there was at least one in between.
				if sawBlank {
					out.WriteString("\n\n")
				} else {
					out.WriteByte('\n')
				}
			}
			sawBlank = false
			out.WriteString(text)
		}
	}
	return body.String(), appendix.String()
}
405
406func parseHeaderMail(mailLine string, r io.Reader, opts patchHeaderOptions) (*PatchHeader, error) {
407 msg, err := mail.ReadMessage(r)
408 if err != nil {
409 return nil, err
410 }
411
412 h := &PatchHeader{}
413
414 if strings.HasPrefix(mailLine, mailHeaderPrefix) {
415 mailLine = strings.TrimPrefix(mailLine, mailHeaderPrefix)
416 if i := strings.IndexByte(mailLine, ' '); i > 0 {
417 h.SHA = mailLine[:i]
418 }
419 }
420
421 addrs, err := msg.Header.AddressList("From")
422 if err != nil && !errors.Is(err, mail.ErrHeaderNotPresent) {
423 return nil, err
424 }
425 if len(addrs) > 0 {
426 addr := addrs[0]
427 if addr.Name == "" {
428 addr.Name = addr.Address
429 }
430 h.Author = &PatchIdentity{Name: addr.Name, Email: addr.Address}
431 }
432
433 date := msg.Header.Get("Date")
434 if date != "" {
435 d, err := ParsePatchDate(date)
436 if err != nil {
437 return nil, err
438 }
439 h.AuthorDate = d
440 }
441
442 subject := msg.Header.Get("Subject")
443 h.SubjectPrefix, h.Title = cleanSubject(subject, opts.subjectCleanMode)
444
445 s := bufio.NewScanner(msg.Body)
446 h.Body, h.BodyAppendix = scanMessageBody(s, "", true)
447 if s.Err() != nil {
448 return nil, s.Err()
449 }
450
451 return h, nil
452}
453
454func cleanSubject(s string, mode SubjectCleanMode) (prefix string, subject string) {
455 switch mode {
456 case SubjectCleanAll, SubjectCleanPatchOnly:
457 case SubjectCleanWhitespace:
458 return "", strings.TrimSpace(decodeSubject(s))
459 default:
460 panic(fmt.Sprintf("unknown clean mode: %d", mode))
461 }
462
463 // Based on the algorithm from Git in mailinfo.c:cleanup_subject()
464 // If compatibility with `git am` drifts, go there to see if there are any updates.
465
466 at := 0
467 for at < len(s) {
468 switch s[at] {
469 case 'r', 'R':
470 // Detect re:, Re:, rE: and RE:
471 if at+2 < len(s) && (s[at+1] == 'e' || s[at+1] == 'E') && s[at+2] == ':' {
472 at += 3
473 continue
474 }
475
476 case ' ', '\t', ':':
477 // Delete whitespace and duplicate ':' characters
478 at++
479 continue
480
481 case '[':
482 if i := strings.IndexByte(s[at:], ']'); i > 0 {
483 if mode == SubjectCleanAll || strings.Contains(s[at:at+i+1], "PATCH") {
484 at += i + 1
485 continue
486 }
487 }
488 }
489
490 // Nothing was removed, end processing
491 break
492 }
493
494 prefix = strings.TrimLeftFunc(s[:at], unicode.IsSpace)
495 subject = strings.TrimRightFunc(decodeSubject(s[at:]), unicode.IsSpace)
496 return
497}
498
// decodeSubject decodes a subject line. Only subjects that start with the
// "=?UTF-8?q?" marker (quoted-printable UTF-8) are decoded; anything else is
// returned unchanged. `git format-patch` produces this format when the
// commit title has a non-ASCII character (i.e. an emoji).
// See for reference: https://stackoverflow.com/questions/27695749/gmail-api-not-respecting-utf-encoding-in-subject
//
// If the quoted-printable payload cannot be decoded, the original subject is
// returned.
func decodeSubject(encoded string) string {
	const (
		wordStart = "=?UTF-8?q?"
		wordEnd   = "?="
	)

	if !strings.HasPrefix(encoded, wordStart) {
		// not UTF-8 encoded
		return encoded
	}

	// A long subject may be split by `git format-patch` across several
	// lines, which looks like this once parsed:
	//   <UTF8-prefix><first-line> <UTF8-prefix><second-line>
	// Prepending a space lets a single replacement remove the first marker
	// and every " <UTF8-prefix>" continuation.
	joined := strings.ReplaceAll(" "+encoded, " "+wordStart, "")
	joined = strings.ReplaceAll(joined, wordEnd, "")

	raw, err := ioutil.ReadAll(quotedprintable.NewReader(strings.NewReader(joined)))
	if err == nil {
		return string(raw)
	}

	// Decoding failed, keep the subject as it was.
	return encoded
}
522}