// Fork of go-gitdiff with jj support.
1package gitdiff
2
3import (
4 "bufio"
5 "errors"
6 "fmt"
7 "io"
8 "io/ioutil"
9 "mime/quotedprintable"
10 "net/mail"
11 "strconv"
12 "strings"
13 "time"
14 "unicode"
15)
16
// First-line prefixes used by ParsePatchHeader to decide which header format
// a preamble uses.
const (
	// prettyHeaderPrefix starts the first line of a pretty-formatted header;
	// the commit SHA follows it.
	prettyHeaderPrefix = "commit "

	// mailHeaderPrefix starts the first line of a mailbox-formatted header;
	// that line is handled separately from the mail content that follows.
	mailHeaderPrefix = "From "

	// mailMinimumHeaderPrefix starts a minimal mail header, where the first
	// line is already part of the mail content.
	mailMinimumHeaderPrefix = "From:"
)
22
23// PatchHeader is a parsed version of the preamble content that appears before
24// the first diff in a patch. It includes metadata about the patch, such as the
25// author and a subject.
26type PatchHeader struct {
27 // The SHA of the commit the patch was generated from. Empty if the SHA is
28 // not included in the header.
29 SHA string
30
31 // The author details of the patch. If these details are not included in
32 // the header, Author is nil and AuthorDate is the zero time.
33 Author *PatchIdentity
34 AuthorDate time.Time
35
36 // The committer details of the patch. If these details are not included in
37 // the header, Committer is nil and CommitterDate is the zero time.
38 Committer *PatchIdentity
39 CommitterDate time.Time
40
41 // The title and body of the commit message describing the changes in the
42 // patch. Empty if no message is included in the header.
43 Title string
44 Body string
45
46 // If the preamble looks like an email, ParsePatchHeader will
47 // remove prefixes such as `Re: ` and `[PATCH v3 5/17]` from the
48 // Title and place them here.
49 SubjectPrefix string
50
51 // If the preamble looks like an email, and it contains a `---`
52 // line, that line will be removed and everything after it will be
53 // placed in BodyAppendix.
54 BodyAppendix string
55
56 // All headers completely unparsed
57 RawHeaders map[string][]string
58}
59
60// Message returns the commit message for the header. The message consists of
61// the title and the body separated by an empty line.
62func (h *PatchHeader) Message() string {
63 var msg strings.Builder
64 if h != nil {
65 msg.WriteString(h.Title)
66 if h.Body != "" {
67 msg.WriteString("\n\n")
68 msg.WriteString(h.Body)
69 }
70 }
71 return msg.String()
72}
73
// ParsePatchDate parses a patch date string and returns the resulting time,
// or an error if s is in an unknown format. It supports the iso, rfc, short,
// raw, unix, and default formats (with local variants) used by the --date
// flag in Git. The empty string is not an error; it yields the zero time.
func ParsePatchDate(s string) (time.Time, error) {
	if s == "" {
		return time.Time{}, nil
	}

	// Textual layouts, tried in order. Layouts that carry no zone are
	// interpreted in the local time zone.
	layouts := [...]string{
		"2006-01-02 15:04:05 -0700",      // iso
		"2006-01-02T15:04:05-07:00",      // iso-strict
		"Mon, 2 Jan 2006 15:04:05 -0700", // rfc
		"2006-01-02",                     // short
		"Mon Jan 2 15:04:05 2006 -0700",  // default
		"Mon Jan 2 15:04:05 2006",        // default-local
	}
	for _, layout := range layouts {
		parsed, err := time.ParseInLocation(layout, s, time.Local)
		if err == nil {
			return parsed, nil
		}
	}

	// unix format: a bare count of seconds since the epoch.
	if secs, err := strconv.ParseInt(s, 10, 64); err == nil {
		return time.Unix(secs, 0), nil
	}

	// raw format: seconds since the epoch, a space, then a numeric zone
	// offset. The offset only selects the location of the returned time.
	if sep := strings.IndexByte(s, ' '); sep > 0 {
		secs, secsErr := strconv.ParseInt(s[:sep], 10, 64)
		offset, offsetErr := time.Parse("-0700", s[sep+1:])
		if secsErr == nil && offsetErr == nil {
			return time.Unix(secs, 0).In(offset.Location()), nil
		}
	}

	return time.Time{}, fmt.Errorf("unknown date format: %s", s)
}
121
122// A PatchHeaderOption modifies the behavior of ParsePatchHeader.
123type PatchHeaderOption func(*patchHeaderOptions)
124
// SubjectCleanMode selects how much ParsePatchHeader strips from the subject
// line of a mail-formatted patch.
type SubjectCleanMode int

const (
	// SubjectCleanWhitespace only trims leading and trailing whitespace.
	SubjectCleanWhitespace SubjectCleanMode = iota

	// SubjectCleanAll trims leading and trailing whitespace and also removes
	// leading reply markers ("Re:" in any letter case), leading ":"
	// characters, and leading strings enclosed by '[' and ']'. This is the
	// default behavior of git (see `git mailinfo`) and of this package.
	SubjectCleanAll

	// SubjectCleanPatchOnly behaves like SubjectCleanAll, except that a
	// leading string enclosed by '[' and ']' is only removed when it
	// contains "PATCH".
	SubjectCleanPatchOnly
)
143
144// WithSubjectCleanMode sets the SubjectCleanMode for header parsing. By
145// default, uses SubjectCleanAll.
146func WithSubjectCleanMode(m SubjectCleanMode) PatchHeaderOption {
147 return func(opts *patchHeaderOptions) {
148 opts.subjectCleanMode = m
149 }
150}
151
152type patchHeaderOptions struct {
153 subjectCleanMode SubjectCleanMode
154}
155
156// ParsePatchHeader parses the preamble string returned by [Parse] into a
157// PatchHeader. Due to the variety of header formats, some fields of the parsed
158// PatchHeader may be unset after parsing.
159//
160// Supported formats are the short, medium, full, fuller, and email pretty
161// formats used by `git diff`, `git log`, and `git show` and the UNIX mailbox
162// format used by `git format-patch`.
163//
164// When parsing mail-formatted headers, ParsePatchHeader tries to remove
165// email-specific content from the title and body:
166//
167// - Based on the SubjectCleanMode, remove prefixes like reply markers and
168// "[PATCH]" strings from the subject, saving any removed content in the
169// SubjectPrefix field. Parsing always discards leading and trailing
170// whitespace from the subject line. The default mode is SubjectCleanAll.
171//
172// - If the body contains a "---" line (3 hyphens), remove that line and any
173// content after it from the body and save it in the BodyAppendix field.
174//
175// ParsePatchHeader tries to process content it does not understand wthout
176// returning errors, but will return errors if well-identified content like
177// dates or identies uses unknown or invalid formats.
178func ParsePatchHeader(header string, options ...PatchHeaderOption) (*PatchHeader, error) {
179 opts := patchHeaderOptions{
180 subjectCleanMode: SubjectCleanAll, // match git defaults
181 }
182 for _, optFn := range options {
183 optFn(&opts)
184 }
185
186 header = strings.TrimSpace(header)
187 if header == "" {
188 return &PatchHeader{}, nil
189 }
190
191 var firstLine, rest string
192 if idx := strings.IndexByte(header, '\n'); idx >= 0 {
193 firstLine = header[:idx]
194 rest = header[idx+1:]
195 } else {
196 firstLine = header
197 rest = ""
198 }
199
200 switch {
201 case strings.HasPrefix(firstLine, mailHeaderPrefix):
202 return parseHeaderMail(firstLine, strings.NewReader(rest), opts)
203
204 case strings.HasPrefix(firstLine, mailMinimumHeaderPrefix):
205 // With a minimum header, the first line is part of the actual mail
206 // content and needs to be parsed as part of the "rest"
207 return parseHeaderMail("", strings.NewReader(header), opts)
208
209 case strings.HasPrefix(firstLine, prettyHeaderPrefix):
210 return parseHeaderPretty(firstLine, strings.NewReader(rest))
211 }
212
213 return nil, errors.New("unrecognized patch header format")
214}
215
216func parseHeaderPretty(prettyLine string, r io.Reader) (*PatchHeader, error) {
217 const (
218 authorPrefix = "Author:"
219 commitPrefix = "Commit:"
220 datePrefix = "Date:"
221 authorDatePrefix = "AuthorDate:"
222 commitDatePrefix = "CommitDate:"
223 )
224
225 h := &PatchHeader{}
226
227 prettyLine = strings.TrimPrefix(prettyLine, prettyHeaderPrefix)
228 if i := strings.IndexByte(prettyLine, ' '); i > 0 {
229 h.SHA = prettyLine[:i]
230 } else {
231 h.SHA = prettyLine
232 }
233
234 s := bufio.NewScanner(r)
235 for s.Scan() {
236 line := s.Text()
237
238 // empty line marks end of fields, remaining lines are title/message
239 if strings.TrimSpace(line) == "" {
240 break
241 }
242
243 items := strings.SplitN(line, ":", 2)
244
245 // we have "key: value"
246 if len(items) == 2 {
247 key := items[0]
248 val := items[1]
249 h.RawHeaders[key] = append(h.RawHeaders[key], val)
250 }
251
252 switch {
253 case strings.HasPrefix(line, authorPrefix):
254 u, err := ParsePatchIdentity(line[len(authorPrefix):])
255 if err != nil {
256 return nil, err
257 }
258 h.Author = &u
259
260 case strings.HasPrefix(line, commitPrefix):
261 u, err := ParsePatchIdentity(line[len(commitPrefix):])
262 if err != nil {
263 return nil, err
264 }
265 h.Committer = &u
266
267 case strings.HasPrefix(line, datePrefix):
268 d, err := ParsePatchDate(strings.TrimSpace(line[len(datePrefix):]))
269 if err != nil {
270 return nil, err
271 }
272 h.AuthorDate = d
273
274 case strings.HasPrefix(line, authorDatePrefix):
275 d, err := ParsePatchDate(strings.TrimSpace(line[len(authorDatePrefix):]))
276 if err != nil {
277 return nil, err
278 }
279 h.AuthorDate = d
280
281 case strings.HasPrefix(line, commitDatePrefix):
282 d, err := ParsePatchDate(strings.TrimSpace(line[len(commitDatePrefix):]))
283 if err != nil {
284 return nil, err
285 }
286 h.CommitterDate = d
287 }
288 }
289 if s.Err() != nil {
290 return nil, s.Err()
291 }
292
293 title, indent := scanMessageTitle(s)
294 if s.Err() != nil {
295 return nil, s.Err()
296 }
297 h.Title = title
298
299 if title != "" {
300 // Don't check for an appendix, pretty headers do not contain them
301 body, _ := scanMessageBody(s, indent, false)
302 if s.Err() != nil {
303 return nil, s.Err()
304 }
305 h.Body = body
306 }
307
308 return h, nil
309}
310
// scanMessageTitle reads lines from s until it reaches a blank line or the
// end of input and joins them, each trimmed of surrounding whitespace, with
// single spaces to form the title. indent is the leading whitespace of the
// first line read, or empty if that line has none. The blank line that ends
// the title is consumed. Callers must check s.Err() afterwards.
func scanMessageTitle(s *bufio.Scanner) (title string, indent string) {
	var parts []string
	for first := true; s.Scan(); first = false {
		raw := s.Text()
		text := strings.TrimSpace(raw)
		if text == "" {
			break
		}

		if first {
			// Only the first line determines the indent.
			notSpace := func(c rune) bool { return !unicode.IsSpace(c) }
			if n := strings.IndexFunc(raw, notSpace); n > 0 {
				indent = raw[:n]
			}
		}
		parts = append(parts, text)
	}
	return strings.Join(parts, " "), indent
}
332
// scanMessageBody reads the remaining lines from s and returns the message
// body and, optionally, its appendix.
//
// Each line has trailing whitespace removed and then the indent prefix, if
// present, stripped. Blank lines at the start of a section are dropped and
// any run of blank lines between two non-blank lines becomes a single blank
// line. When separateAppendix is true, the first line equal to "---" is
// discarded and everything after it is returned as the appendix (often added
// by `git format-patch` and removed by `git am`); otherwise the appendix is
// empty. Callers must check s.Err() afterwards.
func scanMessageBody(s *bufio.Scanner, indent string, separateAppendix bool) (string, string) {
	var body, appendix strings.Builder
	out := &body          // section currently being written
	pendingBlank := false // a blank line was seen since the last written line

	for s.Scan() {
		text := strings.TrimRightFunc(s.Text(), unicode.IsSpace)
		text = strings.TrimPrefix(text, indent)

		switch {
		case text == "":
			pendingBlank = true

		case separateAppendix && out == &body && text == "---":
			// Only the first separator switches sections; later "---" lines
			// are ordinary appendix content.
			out = &appendix

		default:
			if out.Len() > 0 {
				out.WriteByte('\n')
				if pendingBlank {
					out.WriteByte('\n')
				}
			}
			pendingBlank = false
			out.WriteString(text)
		}
	}
	return body.String(), appendix.String()
}
368
369func parseHeaderMail(mailLine string, r io.Reader, opts patchHeaderOptions) (*PatchHeader, error) {
370 msg, err := mail.ReadMessage(r)
371 if err != nil {
372 return nil, err
373 }
374
375 h := &PatchHeader{}
376 h.RawHeaders = msg.Header
377
378 if strings.HasPrefix(mailLine, mailHeaderPrefix) {
379 mailLine = strings.TrimPrefix(mailLine, mailHeaderPrefix)
380 if i := strings.IndexByte(mailLine, ' '); i > 0 {
381 h.SHA = mailLine[:i]
382 }
383 }
384
385 from := msg.Header.Get("From")
386 if from != "" {
387 u, err := ParsePatchIdentity(from)
388 if err != nil {
389 return nil, err
390 }
391 h.Author = &u
392 }
393
394 date := msg.Header.Get("Date")
395 if date != "" {
396 d, err := ParsePatchDate(date)
397 if err != nil {
398 return nil, err
399 }
400 h.AuthorDate = d
401 }
402
403 subject := msg.Header.Get("Subject")
404 h.SubjectPrefix, h.Title = cleanSubject(subject, opts.subjectCleanMode)
405
406 s := bufio.NewScanner(msg.Body)
407 h.Body, h.BodyAppendix = scanMessageBody(s, "", true)
408 if s.Err() != nil {
409 return nil, s.Err()
410 }
411
412 return h, nil
413}
414
415func cleanSubject(s string, mode SubjectCleanMode) (prefix string, subject string) {
416 switch mode {
417 case SubjectCleanAll, SubjectCleanPatchOnly:
418 case SubjectCleanWhitespace:
419 return "", strings.TrimSpace(decodeSubject(s))
420 default:
421 panic(fmt.Sprintf("unknown clean mode: %d", mode))
422 }
423
424 // Based on the algorithm from Git in mailinfo.c:cleanup_subject()
425 // If compatibility with `git am` drifts, go there to see if there are any updates.
426
427 at := 0
428 for at < len(s) {
429 switch s[at] {
430 case 'r', 'R':
431 // Detect re:, Re:, rE: and RE:
432 if at+2 < len(s) && (s[at+1] == 'e' || s[at+1] == 'E') && s[at+2] == ':' {
433 at += 3
434 continue
435 }
436
437 case ' ', '\t', ':':
438 // Delete whitespace and duplicate ':' characters
439 at++
440 continue
441
442 case '[':
443 if i := strings.IndexByte(s[at:], ']'); i > 0 {
444 if mode == SubjectCleanAll || strings.Contains(s[at:at+i+1], "PATCH") {
445 at += i + 1
446 continue
447 }
448 }
449 }
450
451 // Nothing was removed, end processing
452 break
453 }
454
455 prefix = strings.TrimLeftFunc(s[:at], unicode.IsSpace)
456 subject = strings.TrimRightFunc(decodeSubject(s[at:]), unicode.IsSpace)
457 return
458}
459
// decodeSubject decodes an encoded subject line. Only quoted-printable UTF-8
// is supported, which is what `git format-patch` produces when the commit
// title has a non-ASCII character (e.g. an emoji). Input that does not start
// with the "=?UTF-8?q?" marker, or that fails to decode, is returned
// unchanged.
// See for reference: https://stackoverflow.com/questions/27695749/gmail-api-not-respecting-utf-encoding-in-subject
func decodeSubject(encoded string) string {
	const (
		wordStart = "=?UTF-8?q?"
		wordEnd   = "?="
	)

	if !strings.HasPrefix(encoded, wordStart) {
		// not UTF-8 encoded
		return encoded
	}

	// A long subject may be split by `git format-patch` across several
	// lines, which after header parsing looks like:
	//   <UTF8-prefix><first-line> <UTF8-prefix><second-line>
	// Prepending a space lets one replacement strip every start marker
	// together with the separator in front of it.
	payload := strings.ReplaceAll(" "+encoded, " "+wordStart, "")
	payload = strings.ReplaceAll(payload, wordEnd, "")

	decoded, err := ioutil.ReadAll(quotedprintable.NewReader(strings.NewReader(payload)))
	if err != nil {
		// if err, abort decoding and return original subject
		return encoded
	}
	return string(decoded)
}
483}