2 * parse-diff.c: functions for parsing diff files
4 * ====================================================================
5 * Licensed to the Apache Software Foundation (ASF) under one
6 * or more contributor license agreements. See the NOTICE file
7 * distributed with this work for additional information
8 * regarding copyright ownership. The ASF licenses this file
9 * to you under the Apache License, Version 2.0 (the
10 * "License"); you may not use this file except in compliance
11 * with the License. You may obtain a copy of the License at
13 * http://www.apache.org/licenses/LICENSE-2.0
15 * Unless required by applicable law or agreed to in writing,
16 * software distributed under the License is distributed on an
17 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
18 * KIND, either express or implied. See the License for the
19 * specific language governing permissions and limitations
21 * ====================================================================
29 #include "svn_types.h"
30 #include "svn_error.h"
32 #include "svn_pools.h"
33 #include "svn_props.h"
34 #include "svn_string.h"
36 #include "svn_dirent_uri.h"
38 #include "svn_ctype.h"
39 #include "svn_mergeinfo.h"
41 #include "private/svn_eol_private.h"
42 #include "private/svn_dep_compat.h"
43 #include "private/svn_sorts_private.h"
45 /* Helper macro for readability: evaluates to non-zero when the string
 * STR begins with the string START. START is expanded twice (once for
 * strncmp() and once for strlen()), so it must not have side effects. */
46 #define starts_with(str, start) \
47 (strncmp((str), (start), strlen(start)) == 0)
49 /* Like strlen() but for string literals. The length is computed with
 * sizeof, so STR must be a literal or a char array, never a pointer. */
50 #define STRLEN_LITERAL(str) (sizeof(str) - 1)
52 /* This struct describes a range within a file, as well as the
53 * current cursor position within the range. All numbers are in bytes. */
54 struct svn_diff__hunk_range {
60 struct svn_diff_hunk_t {
61 /* The patch this hunk belongs to. */
64 /* APR file handle to the patch file this hunk came from. */
67 /* Ranges used to keep track of this hunk's text positions within
69 struct svn_diff__hunk_range diff_text_range;
70 struct svn_diff__hunk_range original_text_range;
71 struct svn_diff__hunk_range modified_text_range;
73 /* Hunk ranges as they appeared in the patch file.
74 * All numbers are lines, not bytes. */
75 svn_linenum_t original_start;
76 svn_linenum_t original_length;
77 svn_linenum_t modified_start;
78 svn_linenum_t modified_length;
80 /* Number of lines of leading and trailing hunk context. */
81 svn_linenum_t leading_context;
82 svn_linenum_t trailing_context;
86 svn_diff_hunk_reset_diff_text(svn_diff_hunk_t *hunk)
88 hunk->diff_text_range.current = hunk->diff_text_range.start;
92 svn_diff_hunk_reset_original_text(svn_diff_hunk_t *hunk)
94 if (hunk->patch->reverse)
95 hunk->modified_text_range.current = hunk->modified_text_range.start;
97 hunk->original_text_range.current = hunk->original_text_range.start;
101 svn_diff_hunk_reset_modified_text(svn_diff_hunk_t *hunk)
103 if (hunk->patch->reverse)
104 hunk->original_text_range.current = hunk->original_text_range.start;
106 hunk->modified_text_range.current = hunk->modified_text_range.start;
110 svn_diff_hunk_get_original_start(const svn_diff_hunk_t *hunk)
112 return hunk->patch->reverse ? hunk->modified_start : hunk->original_start;
116 svn_diff_hunk_get_original_length(const svn_diff_hunk_t *hunk)
118 return hunk->patch->reverse ? hunk->modified_length : hunk->original_length;
122 svn_diff_hunk_get_modified_start(const svn_diff_hunk_t *hunk)
124 return hunk->patch->reverse ? hunk->original_start : hunk->modified_start;
128 svn_diff_hunk_get_modified_length(const svn_diff_hunk_t *hunk)
130 return hunk->patch->reverse ? hunk->original_length : hunk->modified_length;
134 svn_diff_hunk_get_leading_context(const svn_diff_hunk_t *hunk)
136 return hunk->leading_context;
140 svn_diff_hunk_get_trailing_context(const svn_diff_hunk_t *hunk)
142 return hunk->trailing_context;
145 /* Try to parse a non-negative decimal number from the string NUMBER.
146 * Return the parsed number in *OFFSET, and return
147 * TRUE if parsing was successful. */
149 parse_offset(svn_linenum_t *offset, const char *number)
/* Accept base-10 values from 0 up to SVN_LINENUM_MAX_VALUE. */
154 err = svn_cstring_strtoui64(&val, number, 0, SVN_LINENUM_MAX_VALUE, 10);
157 svn_error_clear(err);
161 *offset = (svn_linenum_t)val;
166 /* Try to parse a hunk range specification from the string RANGE.
167 * Return parsed information in *START and *LENGTH, and return TRUE
168 * if the range parsed correctly. Note: This function may modify the
169 * input value RANGE. */
171 parse_range(svn_linenum_t *start, svn_linenum_t *length, char *range)
178 comma = strstr(range, ",");
181 if (strlen(comma + 1) > 0)
183 /* Try to parse the length. */
184 if (! parse_offset(length, comma + 1))
187 /* Snip off the end of the string,
188 * so we can comfortably parse the line
189 * number the hunk starts at. */
193 /* A comma but no length? */
201 /* Try to parse the line number the hunk starts at. */
202 return parse_offset(start, range);
205 /* Try to parse a hunk header in string HEADER, putting parsed information
206 * into HUNK. Return TRUE if the header parsed correctly. ATAT is the
207 * character string used to delimit the hunk header.
208 * Do all allocations in POOL. */
210 parse_hunk_header(const char *header, svn_diff_hunk_t *hunk,
211 const char *atat, apr_pool_t *pool)
215 svn_stringbuf_t *range;
217 p = header + strlen(atat);
225 /* OK, this may be worth allocating some memory for... */
226 range = svn_stringbuf_create_ensure(31, pool);
228 while (*p && *p != ' ')
237 svn_stringbuf_appendbytes(range, start, p - start);
239 /* Try to parse the first range. */
240 if (! parse_range(&hunk->original_start, &hunk->original_length, range->data))
243 /* Clear the stringbuf so we can reuse it for the second range. */
244 svn_stringbuf_setempty(range);
249 /* OK, this may be worth copying... */
251 while (*p && *p != ' ')
259 svn_stringbuf_appendbytes(range, start, p - start);
261 /* Check for trailing @@ */
263 if (! starts_with(p, atat))
266 /* There may be stuff like C-function names after the trailing @@,
267 * but we ignore that. */
269 /* Try to parse the second range. */
270 if (! parse_range(&hunk->modified_start, &hunk->modified_length, range->data))
273 /* Hunk header is good. */
277 /* Read a line of original or modified hunk text from the specified
278 * RANGE within FILE. FILE is expected to contain unidiff text.
279 * Leading unidiff symbols ('+', '-', and ' ') are removed from the line.
280 * Any lines commencing with the VERBOTEN character are discarded.
281 * VERBOTEN should be '+' or '-', depending on which form of hunk text
284 * All other parameters are as in svn_diff_hunk_readline_original_text()
285 * and svn_diff_hunk_readline_modified_text().
288 hunk_readline_original_or_modified(apr_file_t *file,
289 struct svn_diff__hunk_range *range,
290 svn_stringbuf_t **stringbuf,
294 apr_pool_t *result_pool,
295 apr_pool_t *scratch_pool)
298 svn_boolean_t filtered;
300 svn_stringbuf_t *str;
302 if (range->current >= range->end)
304 /* We're past the range. Indicate that no bytes can be read. */
308 *stringbuf = svn_stringbuf_create_empty(result_pool);
313 SVN_ERR(svn_io_file_seek(file, APR_CUR, &pos, scratch_pool));
314 SVN_ERR(svn_io_file_seek(file, APR_SET, &range->current, scratch_pool));
317 max_len = range->end - range->current;
318 SVN_ERR(svn_io_file_readline(file, &str, eol, eof, max_len,
319 result_pool, scratch_pool));
321 SVN_ERR(svn_io_file_seek(file, APR_CUR, &range->current, scratch_pool));
322 filtered = (str->data[0] == verboten || str->data[0] == '\\');
324 while (filtered && ! *eof);
328 /* EOF, return an empty string. */
329 *stringbuf = svn_stringbuf_create_ensure(0, result_pool);
331 else if (str->data[0] == '+' || str->data[0] == '-' || str->data[0] == ' ')
333 /* Shave off leading unidiff symbols. */
334 *stringbuf = svn_stringbuf_create(str->data + 1, result_pool);
338 /* Return the line as-is. */
339 *stringbuf = svn_stringbuf_dup(str, result_pool);
342 SVN_ERR(svn_io_file_seek(file, APR_SET, &pos, scratch_pool));
348 svn_diff_hunk_readline_original_text(svn_diff_hunk_t *hunk,
349 svn_stringbuf_t **stringbuf,
352 apr_pool_t *result_pool,
353 apr_pool_t *scratch_pool)
355 return svn_error_trace(
356 hunk_readline_original_or_modified(hunk->apr_file,
357 hunk->patch->reverse ?
358 &hunk->modified_text_range :
359 &hunk->original_text_range,
361 hunk->patch->reverse ? '-' : '+',
362 result_pool, scratch_pool));
366 svn_diff_hunk_readline_modified_text(svn_diff_hunk_t *hunk,
367 svn_stringbuf_t **stringbuf,
370 apr_pool_t *result_pool,
371 apr_pool_t *scratch_pool)
373 return svn_error_trace(
374 hunk_readline_original_or_modified(hunk->apr_file,
375 hunk->patch->reverse ?
376 &hunk->original_text_range :
377 &hunk->modified_text_range,
379 hunk->patch->reverse ? '+' : '-',
380 result_pool, scratch_pool));
384 svn_diff_hunk_readline_diff_text(svn_diff_hunk_t *hunk,
385 svn_stringbuf_t **stringbuf,
388 apr_pool_t *result_pool,
389 apr_pool_t *scratch_pool)
391 svn_stringbuf_t *line;
395 if (hunk->diff_text_range.current >= hunk->diff_text_range.end)
397 /* We're past the range. Indicate that no bytes can be read. */
401 *stringbuf = svn_stringbuf_create_empty(result_pool);
406 SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_CUR, &pos, scratch_pool));
407 SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_SET,
408 &hunk->diff_text_range.current, scratch_pool));
409 max_len = hunk->diff_text_range.end - hunk->diff_text_range.current;
410 SVN_ERR(svn_io_file_readline(hunk->apr_file, &line, eol, eof, max_len,
413 hunk->diff_text_range.current = 0;
414 SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_CUR,
415 &hunk->diff_text_range.current, scratch_pool));
416 SVN_ERR(svn_io_file_seek(hunk->apr_file, APR_SET, &pos, scratch_pool));
418 if (hunk->patch->reverse)
420 if (line->data[0] == '+')
422 else if (line->data[0] == '-')
431 /* Parse *PROP_NAME from HEADER as the part after the INDICATOR line.
432 * Allocate *PROP_NAME in RESULT_POOL.
433 * Set *PROP_NAME to NULL if no valid property name was found. */
435 parse_prop_name(const char **prop_name, const char *header,
436 const char *indicator, apr_pool_t *result_pool)
438 SVN_ERR(svn_utf_cstring_to_utf8(prop_name,
439 header + strlen(indicator),
441 if (**prop_name == '\0')
443 else if (! svn_prop_name_is_valid(*prop_name))
445 svn_stringbuf_t *buf = svn_stringbuf_create(*prop_name, result_pool);
446 svn_stringbuf_strip_whitespace(buf);
447 *prop_name = (svn_prop_name_is_valid(buf->data) ? buf->data : NULL);
454 /* A helper function to parse svn:mergeinfo diffs.
456 * These diffs use a special pretty-print format, for instance:
458 * Added: svn:mergeinfo
462 * The hunk header has the following format:
463 * ## -0,NUMBER_OF_REVERSE_MERGES +0,NUMBER_OF_FORWARD_MERGES ##
465 * At this point, the number of reverse merges has already been
466 * parsed into HUNK->ORIGINAL_LENGTH, and the number of forward
467 * merges has been parsed into HUNK->MODIFIED_LENGTH.
469 * The header is followed by a list of mergeinfo, one path per line.
470 * This function parses such lines. Lines describing reverse merges
471 * appear first, and then all lines describing forward merges appear.
473 * Parts of the line are affected by i18n. The words 'Merged'
474 * and 'Reverse-merged' can appear in any language and at any
475 * position within the line. We can only assume that a leading
476 * '/' starts the merge source path, the path is followed by
477 * ":r", which in turn is followed by a mergeinfo revision range,
478 * which is terminated by whitespace or end-of-string.
480 * If the current line meets the above criteria and we're able
481 * to parse valid mergeinfo from it, the resulting mergeinfo
482 * is added to patch->mergeinfo or patch->reverse_mergeinfo,
483 * and we proceed to the next line.
486 parse_mergeinfo(svn_boolean_t *found_mergeinfo,
487 svn_stringbuf_t *line,
488 svn_diff_hunk_t *hunk,
490 apr_pool_t *result_pool,
491 apr_pool_t *scratch_pool)
493 char *slash = strchr(line->data, '/');
494 char *colon = strrchr(line->data, ':');
496 *found_mergeinfo = FALSE;
498 if (slash && colon && colon[1] == 'r' && slash < colon)
500 svn_stringbuf_t *input;
501 svn_mergeinfo_t mergeinfo = NULL;
505 input = svn_stringbuf_create_ensure(line->len, scratch_pool);
507 /* Copy the merge source path + colon */
511 svn_stringbuf_appendbyte(input, *s);
515 /* skip 'r' after colon */
518 /* Copy the revision range. */
519 while (s < line->data + line->len)
521 if (svn_ctype_isspace(*s))
523 svn_stringbuf_appendbyte(input, *s);
527 err = svn_mergeinfo_parse(&mergeinfo, input->data, result_pool);
528 if (err && err->apr_err == SVN_ERR_MERGEINFO_PARSE_ERROR)
530 svn_error_clear(err);
538 if (hunk->original_length > 0) /* reverse merges */
542 if (patch->mergeinfo == NULL)
543 patch->mergeinfo = mergeinfo;
545 SVN_ERR(svn_mergeinfo_merge2(patch->mergeinfo,
552 if (patch->reverse_mergeinfo == NULL)
553 patch->reverse_mergeinfo = mergeinfo;
555 SVN_ERR(svn_mergeinfo_merge2(patch->reverse_mergeinfo,
560 hunk->original_length--;
562 else if (hunk->modified_length > 0) /* forward merges */
566 if (patch->reverse_mergeinfo == NULL)
567 patch->reverse_mergeinfo = mergeinfo;
569 SVN_ERR(svn_mergeinfo_merge2(patch->reverse_mergeinfo,
576 if (patch->mergeinfo == NULL)
577 patch->mergeinfo = mergeinfo;
579 SVN_ERR(svn_mergeinfo_merge2(patch->mergeinfo,
584 hunk->modified_length--;
587 *found_mergeinfo = TRUE;
594 /* Return the next *HUNK from a PATCH in APR_FILE.
595 * If no hunk can be found, set *HUNK to NULL.
596 * Set IS_PROPERTY to TRUE if we have a property hunk. If the returned HUNK
597 * is the first belonging to a certain property, then PROP_NAME and
598 * PROP_OPERATION will be set too. If we have a text hunk, PROP_NAME will be
599 * NULL. If IGNORE_WHITESPACE is TRUE, lines without leading spaces will be
600 * treated as context lines. Allocate results in RESULT_POOL.
601 * Use SCRATCH_POOL for all other allocations. */
603 parse_next_hunk(svn_diff_hunk_t **hunk,
604 svn_boolean_t *is_property,
605 const char **prop_name,
606 svn_diff_operation_kind_t *prop_operation,
608 apr_file_t *apr_file,
609 svn_boolean_t ignore_whitespace,
610 apr_pool_t *result_pool,
611 apr_pool_t *scratch_pool)
613 static const char * const minus = "--- ";
614 static const char * const text_atat = "@@";
615 static const char * const prop_atat = "##";
616 svn_stringbuf_t *line;
617 svn_boolean_t eof, in_hunk, hunk_seen;
618 apr_off_t pos, last_line;
619 apr_off_t start, end;
620 apr_off_t original_end;
621 apr_off_t modified_end;
622 svn_linenum_t original_lines;
623 svn_linenum_t modified_lines;
624 svn_linenum_t leading_context;
625 svn_linenum_t trailing_context;
626 svn_boolean_t changed_line_seen;
633 apr_pool_t *iterpool;
635 *prop_operation = svn_diff_op_unchanged;
637 /* We only set this if we have a property hunk header. */
639 *is_property = FALSE;
641 if (apr_file_eof(apr_file) == APR_EOF)
643 /* No more hunks here. */
651 trailing_context = 0;
652 changed_line_seen = FALSE;
655 *hunk = apr_pcalloc(result_pool, sizeof(**hunk));
657 /* Get current seek position -- APR has no ftell() :( */
659 SVN_ERR(svn_io_file_seek(apr_file, APR_CUR, &pos, scratch_pool));
661 /* Start out assuming noise. */
662 last_line_type = noise_line;
664 iterpool = svn_pool_create(scratch_pool);
668 svn_pool_clear(iterpool);
670 /* Remember the current line's offset, and read the line. */
672 SVN_ERR(svn_io_file_readline(apr_file, &line, NULL, &eof, APR_SIZE_MAX,
673 iterpool, iterpool));
675 /* Update line offset for next iteration. */
677 SVN_ERR(svn_io_file_seek(apr_file, APR_CUR, &pos, iterpool));
679 /* Lines starting with a backslash indicate a missing EOL:
680 * "\ No newline at end of file" or "end of property". */
681 if (line->data[0] == '\\')
688 apr_off_t hunk_text_end;
690 /* Comment terminates the hunk text and says the hunk text
691 * has no trailing EOL. Snip off trailing EOL which is part
692 * of the patch file but not part of the hunk text. */
694 SVN_ERR(svn_io_file_seek(apr_file, APR_SET, &off, iterpool));
695 len = sizeof(eolbuf);
696 SVN_ERR(svn_io_file_read_full2(apr_file, eolbuf, len, &len,
698 if (eolbuf[0] == '\r' && eolbuf[1] == '\n')
699 hunk_text_end = last_line - 2;
700 else if (eolbuf[1] == '\n' || eolbuf[1] == '\r')
701 hunk_text_end = last_line - 1;
703 hunk_text_end = last_line;
705 if (last_line_type == original_line && original_end == 0)
706 original_end = hunk_text_end;
707 else if (last_line_type == modified_line && modified_end == 0)
708 modified_end = hunk_text_end;
709 else if (last_line_type == context_line)
711 if (original_end == 0)
712 original_end = hunk_text_end;
713 if (modified_end == 0)
714 modified_end = hunk_text_end;
717 SVN_ERR(svn_io_file_seek(apr_file, APR_SET, &pos, iterpool));
723 if (in_hunk && *is_property && *prop_name &&
724 strcmp(*prop_name, SVN_PROP_MERGEINFO) == 0)
726 svn_boolean_t found_mergeinfo;
728 SVN_ERR(parse_mergeinfo(&found_mergeinfo, line, *hunk, patch,
729 result_pool, iterpool));
731 continue; /* Proceed to the next line in the patch. */
737 static const char add = '+';
738 static const char del = '-';
742 /* We're reading the first line of the hunk, so the start
743 * of the line just read is the hunk text's byte offset. */
748 if (original_lines > 0 && modified_lines > 0 &&
750 /* Tolerate chopped leading spaces on empty lines. */
751 || (! eof && line->len == 0)
752 /* Maybe tolerate chopped leading spaces on non-empty lines. */
753 || (ignore_whitespace && c != del && c != add)))
755 /* It's a "context" line in the hunk. */
759 if (changed_line_seen)
763 last_line_type = context_line;
765 else if (original_lines > 0 && c == del)
767 /* It's a "deleted" line in the hunk. */
769 changed_line_seen = TRUE;
771 /* A hunk may have context in the middle. We only want
772 trailing lines of context. */
773 if (trailing_context > 0)
774 trailing_context = 0;
777 last_line_type = original_line;
779 else if (modified_lines > 0 && c == add)
781 /* It's an "added" line in the hunk. */
783 changed_line_seen = TRUE;
785 /* A hunk may have context in the middle. We only want
786 trailing lines of context. */
787 if (trailing_context > 0)
788 trailing_context = 0;
791 last_line_type = modified_line;
797 /* The hunk ends at EOF. */
802 /* The start of the current line marks the first byte
803 * after the hunk text. */
807 if (original_end == 0)
809 if (modified_end == 0)
811 break; /* Hunk was empty or has been read. */
816 if (starts_with(line->data, text_atat))
818 /* Looks like we have a hunk header, try to rip it apart. */
819 in_hunk = parse_hunk_header(line->data, *hunk, text_atat,
823 original_lines = (*hunk)->original_length;
824 modified_lines = (*hunk)->modified_length;
825 *is_property = FALSE;
828 else if (starts_with(line->data, prop_atat))
830 /* Looks like we have a property hunk header, try to rip it
832 in_hunk = parse_hunk_header(line->data, *hunk, prop_atat,
836 original_lines = (*hunk)->original_length;
837 modified_lines = (*hunk)->modified_length;
841 else if (starts_with(line->data, "Added: "))
843 SVN_ERR(parse_prop_name(prop_name, line->data, "Added: ",
846 *prop_operation = svn_diff_op_added;
848 else if (starts_with(line->data, "Deleted: "))
850 SVN_ERR(parse_prop_name(prop_name, line->data, "Deleted: ",
853 *prop_operation = svn_diff_op_deleted;
855 else if (starts_with(line->data, "Modified: "))
857 SVN_ERR(parse_prop_name(prop_name, line->data, "Modified: ",
860 *prop_operation = svn_diff_op_modified;
862 else if (starts_with(line->data, minus)
863 || starts_with(line->data, "diff --git "))
864 /* This could be a header of another patch. Bail out. */
868 /* Check for the line length since a file may not have a newline at the
869 * end and we depend upon the last line to be an empty one. */
870 while (! eof || line->len > 0);
871 svn_pool_destroy(iterpool);
874 /* Rewind to the start of the line just read, so subsequent calls
875 * to this function or svn_diff_parse_next_patch() don't end
876 * up skipping the line -- it may contain a patch or hunk header. */
877 SVN_ERR(svn_io_file_seek(apr_file, APR_SET, &last_line, scratch_pool));
879 if (hunk_seen && start < end)
881 (*hunk)->patch = patch;
882 (*hunk)->apr_file = apr_file;
883 (*hunk)->leading_context = leading_context;
884 (*hunk)->trailing_context = trailing_context;
885 (*hunk)->diff_text_range.start = start;
886 (*hunk)->diff_text_range.current = start;
887 (*hunk)->diff_text_range.end = end;
888 (*hunk)->original_text_range.start = start;
889 (*hunk)->original_text_range.current = start;
890 (*hunk)->original_text_range.end = original_end;
891 (*hunk)->modified_text_range.start = start;
892 (*hunk)->modified_text_range.current = start;
893 (*hunk)->modified_text_range.end = modified_end;
896 /* Something went wrong, just discard the result. */
902 /* Compare function for sorting hunks after parsing.
903 * We sort hunks by their original line offset. */
905 compare_hunks(const void *a, const void *b)
907 const svn_diff_hunk_t *ha = *((const svn_diff_hunk_t *const *)a);
908 const svn_diff_hunk_t *hb = *((const svn_diff_hunk_t *const *)b);
910 if (ha->original_start < hb->original_start)
912 if (ha->original_start > hb->original_start)
917 /* Possible states of the diff header parser. */
920 state_start, /* initial */
921 state_git_diff_seen, /* diff --git */
922 state_git_tree_seen, /* a tree operation, rather than content change */
923 state_git_minus_seen, /* --- /dev/null; or --- a/ */
924 state_git_plus_seen, /* +++ /dev/null; or +++ b/ */
925 state_move_from_seen, /* rename from foo.c */
926 state_copy_from_seen, /* copy from foo.c */
927 state_minus_seen, /* --- foo.c */
928 state_unidiff_found, /* valid start of a regular unidiff header */
929 state_git_header_found /* valid start of a --git diff header */
932 /* Data type describing a valid state transition of the parser. */
935 const char *expected_input;
936 enum parse_state required_state;
938 /* A callback called upon each parser state transition. */
939 svn_error_t *(*fn)(enum parse_state *new_state, char *input,
940 svn_patch_t *patch, apr_pool_t *result_pool,
941 apr_pool_t *scratch_pool);
944 /* UTF-8 encode and canonicalize the content of LINE as FILE_NAME. */
946 grab_filename(const char **file_name, const char *line, apr_pool_t *result_pool,
947 apr_pool_t *scratch_pool)
949 const char *utf8_path;
950 const char *canon_path;
952 /* Grab the filename and encode it in UTF-8. */
953 /* TODO: Allow specifying the patch file's encoding.
954 * For now, we assume its encoding is native. */
955 /* ### This can fail if the filename cannot be represented in the current
956 * ### locale's encoding. */
957 SVN_ERR(svn_utf_cstring_to_utf8(&utf8_path,
961 /* Canonicalize the path name. */
962 canon_path = svn_dirent_canonicalize(utf8_path, scratch_pool);
964 *file_name = apr_pstrdup(result_pool, canon_path);
969 /* Parse the '--- ' line of a regular unidiff. */
971 diff_minus(enum parse_state *new_state, char *line, svn_patch_t *patch,
972 apr_pool_t *result_pool, apr_pool_t *scratch_pool)
974 /* If we can find a tab, it separates the filename from
975 * the rest of the line which we can discard. */
976 char *tab = strchr(line, '\t');
980 SVN_ERR(grab_filename(&patch->old_filename, line + STRLEN_LITERAL("--- "),
981 result_pool, scratch_pool));
983 *new_state = state_minus_seen;
988 /* Parse the '+++ ' line of a regular unidiff. */
990 diff_plus(enum parse_state *new_state, char *line, svn_patch_t *patch,
991 apr_pool_t *result_pool, apr_pool_t *scratch_pool)
993 /* If we can find a tab, it separates the filename from
994 * the rest of the line which we can discard. */
995 char *tab = strchr(line, '\t');
999 SVN_ERR(grab_filename(&patch->new_filename, line + STRLEN_LITERAL("+++ "),
1000 result_pool, scratch_pool));
1002 *new_state = state_unidiff_found;
1004 return SVN_NO_ERROR;
1007 /* Parse the first line of a git extended unidiff
 * ('diff --git a/path b/path') in LINE.
 * If the " a/" or " b/" marker is missing, or nothing follows it,
 * set *NEW_STATE to state_start and return. Otherwise set
 * PATCH->operation to svn_diff_op_modified and *NEW_STATE to
 * state_git_diff_seen. When the path before a " b/" marker equals the
 * path after it, both are stored in PATCH->old_filename and
 * PATCH->new_filename via grab_filename(); in that case a NUL is
 * written into LINE at the marker. */
1008 static svn_error_t *
1009 git_start(enum parse_state *new_state, char *line, svn_patch_t *patch,
1010 apr_pool_t *result_pool, apr_pool_t *scratch_pool)
1012 const char *old_path_start;
1014 const char *new_path_start;
1015 const char *new_path_end;
1016 char *new_path_marker;
1017 const char *old_path_marker;
1019 /* ### Add handling of escaped paths
1020 * http://www.kernel.org/pub/software/scm/git/docs/git-diff.html:
1022 * TAB, LF, double quote and backslash characters in pathnames are
1023 * represented as \t, \n, \" and \\, respectively. If there is need for
1024 * such substitution then the whole pathname is put in double quotes.
1027 /* Our line should look like this: 'diff --git a/path b/path'.
1029 * If we find any deviations from that format, we return with state reset
1032 old_path_marker = strstr(line, " a/");
1034 if (! old_path_marker)
1036 *new_state = state_start;
1037 return SVN_NO_ERROR;
1040 if (! *(old_path_marker + 3))
1042 *new_state = state_start;
1043 return SVN_NO_ERROR;
1046 new_path_marker = strstr(old_path_marker, " b/");
1048 if (! new_path_marker)
1050 *new_state = state_start;
1051 return SVN_NO_ERROR;
1054 if (! *(new_path_marker + 3))
1056 *new_state = state_start;
1057 return SVN_NO_ERROR;
1060 /* By now, we know that we have a line of the form 'diff --git a/.+ b/.+'.
1061 * We only need the filenames when we have deleted or added empty
1062 * files. In those cases the old_path and new_path are identical on the
1063 * 'diff --git' line. For all other cases we fetch the filenames from
1064 * other header lines. */
1065 old_path_start = line + STRLEN_LITERAL("diff --git a/");
1066 new_path_end = line + strlen(line);
1067 new_path_start = old_path_start;
1074 new_path_marker = strstr(new_path_start, " b/");
1076 /* No new path marker, bail out. */
1077 if (! new_path_marker)
1080 old_path_end = new_path_marker;
1081 new_path_start = new_path_marker + STRLEN_LITERAL(" b/");
1083 /* No path after the marker. */
1084 if (! *new_path_start)
1087 len_old = old_path_end - old_path_start;
1088 len_new = new_path_end - new_path_start;
1090 /* Are the paths before and after the " b/" marker the same? */
1091 if (len_old == len_new
1092 && ! strncmp(old_path_start, new_path_start, len_old))
1094 *old_path_end = '\0';
1095 SVN_ERR(grab_filename(&patch->old_filename, old_path_start,
1096 result_pool, scratch_pool));
1098 SVN_ERR(grab_filename(&patch->new_filename, new_path_start,
1099 result_pool, scratch_pool));
1104 /* We assume that the path is only modified until we've found a 'tree'
1106 patch->operation = svn_diff_op_modified;
1108 *new_state = state_git_diff_seen;
1109 return SVN_NO_ERROR;
1112 /* Parse the '--- ' line of a git extended unidiff. */
1113 static svn_error_t *
1114 git_minus(enum parse_state *new_state, char *line, svn_patch_t *patch,
1115 apr_pool_t *result_pool, apr_pool_t *scratch_pool)
1117 /* If we can find a tab, it separates the filename from
1118 * the rest of the line which we can discard. */
1119 char *tab = strchr(line, '\t');
1123 if (starts_with(line, "--- /dev/null"))
1124 SVN_ERR(grab_filename(&patch->old_filename, "/dev/null",
1125 result_pool, scratch_pool));
1127 SVN_ERR(grab_filename(&patch->old_filename, line + STRLEN_LITERAL("--- a/"),
1128 result_pool, scratch_pool));
1130 *new_state = state_git_minus_seen;
1131 return SVN_NO_ERROR;
1134 /* Parse the '+++ ' line of a git extended unidiff. */
1135 static svn_error_t *
1136 git_plus(enum parse_state *new_state, char *line, svn_patch_t *patch,
1137 apr_pool_t *result_pool, apr_pool_t *scratch_pool)
1139 /* If we can find a tab, it separates the filename from
1140 * the rest of the line which we can discard. */
1141 char *tab = strchr(line, '\t');
1145 if (starts_with(line, "+++ /dev/null"))
1146 SVN_ERR(grab_filename(&patch->new_filename, "/dev/null",
1147 result_pool, scratch_pool));
1149 SVN_ERR(grab_filename(&patch->new_filename, line + STRLEN_LITERAL("+++ b/"),
1150 result_pool, scratch_pool));
1152 *new_state = state_git_header_found;
1153 return SVN_NO_ERROR;
1156 /* Parse the 'rename from ' line of a git extended unidiff. */
1157 static svn_error_t *
1158 git_move_from(enum parse_state *new_state, char *line, svn_patch_t *patch,
1159 apr_pool_t *result_pool, apr_pool_t *scratch_pool)
1161 SVN_ERR(grab_filename(&patch->old_filename,
1162 line + STRLEN_LITERAL("rename from "),
1163 result_pool, scratch_pool));
1165 *new_state = state_move_from_seen;
1166 return SVN_NO_ERROR;
1169 /* Parse the 'rename to ' line of a git extended unidiff. */
1170 static svn_error_t *
1171 git_move_to(enum parse_state *new_state, char *line, svn_patch_t *patch,
1172 apr_pool_t *result_pool, apr_pool_t *scratch_pool)
1174 SVN_ERR(grab_filename(&patch->new_filename,
1175 line + STRLEN_LITERAL("rename to "),
1176 result_pool, scratch_pool));
1178 patch->operation = svn_diff_op_moved;
1180 *new_state = state_git_tree_seen;
1181 return SVN_NO_ERROR;
1184 /* Parse the 'copy from ' line of a git extended unidiff. */
1185 static svn_error_t *
1186 git_copy_from(enum parse_state *new_state, char *line, svn_patch_t *patch,
1187 apr_pool_t *result_pool, apr_pool_t *scratch_pool)
1189 SVN_ERR(grab_filename(&patch->old_filename,
1190 line + STRLEN_LITERAL("copy from "),
1191 result_pool, scratch_pool));
1193 *new_state = state_copy_from_seen;
1194 return SVN_NO_ERROR;
1197 /* Parse the 'copy to ' line of a git extended unidiff. */
1198 static svn_error_t *
1199 git_copy_to(enum parse_state *new_state, char *line, svn_patch_t *patch,
1200 apr_pool_t *result_pool, apr_pool_t *scratch_pool)
1202 SVN_ERR(grab_filename(&patch->new_filename, line + STRLEN_LITERAL("copy to "),
1203 result_pool, scratch_pool));
1205 patch->operation = svn_diff_op_copied;
1207 *new_state = state_git_tree_seen;
1208 return SVN_NO_ERROR;
1211 /* Parse the 'new file ' line of a git extended unidiff. */
1212 static svn_error_t *
1213 git_new_file(enum parse_state *new_state, char *line, svn_patch_t *patch,
1214 apr_pool_t *result_pool, apr_pool_t *scratch_pool)
1216 patch->operation = svn_diff_op_added;
1218 /* Filename already retrieved from diff --git header. */
1220 *new_state = state_git_tree_seen;
1221 return SVN_NO_ERROR;
1224 /* Parse the 'deleted file ' line of a git extended unidiff. */
1225 static svn_error_t *
1226 git_deleted_file(enum parse_state *new_state, char *line, svn_patch_t *patch,
1227 apr_pool_t *result_pool, apr_pool_t *scratch_pool)
1229 patch->operation = svn_diff_op_deleted;
1231 /* Filename already retrieved from diff --git header. */
1233 *new_state = state_git_tree_seen;
1234 return SVN_NO_ERROR;
1237 /* Add a HUNK associated with the property PROP_NAME to PATCH. */
1238 static svn_error_t *
1239 add_property_hunk(svn_patch_t *patch, const char *prop_name,
1240 svn_diff_hunk_t *hunk, svn_diff_operation_kind_t operation,
1241 apr_pool_t *result_pool)
1243 svn_prop_patch_t *prop_patch;
1245 prop_patch = svn_hash_gets(patch->prop_patches, prop_name);
1249 prop_patch = apr_palloc(result_pool, sizeof(svn_prop_patch_t));
1250 prop_patch->name = prop_name;
1251 prop_patch->operation = operation;
1252 prop_patch->hunks = apr_array_make(result_pool, 1,
1253 sizeof(svn_diff_hunk_t *));
1255 svn_hash_sets(patch->prop_patches, prop_name, prop_patch);
1258 APR_ARRAY_PUSH(prop_patch->hunks, svn_diff_hunk_t *) = hunk;
1260 return SVN_NO_ERROR;
1263 struct svn_patch_file_t
1265 /* The APR file handle to the patch file. */
1266 apr_file_t *apr_file;
1268 /* The file offset at which the next patch is expected. */
1269 apr_off_t next_patch_offset;
1273 svn_diff_open_patch_file(svn_patch_file_t **patch_file,
1274 const char *local_abspath,
1275 apr_pool_t *result_pool)
1277 svn_patch_file_t *p;
1279 p = apr_palloc(result_pool, sizeof(*p));
1280 SVN_ERR(svn_io_file_open(&p->apr_file, local_abspath,
1281 APR_READ | APR_BUFFERED, APR_OS_DEFAULT,
1283 p->next_patch_offset = 0;
1286 return SVN_NO_ERROR;
1289 /* Parse hunks from APR_FILE and store them in PATCH->HUNKS.
 * Property hunks are instead added to PATCH->PROP_PATCHES through
 * add_property_hunk(); svn:mergeinfo property hunks are skipped.
1290 * Parsing stops if no valid next hunk can be found.
1291 * If IGNORE_WHITESPACE is TRUE, lines without
1292 * leading spaces will be treated as context lines.
1293 * Allocate results in RESULT_POOL.
1294 * Use SCRATCH_POOL for temporary allocations. */
1295 static svn_error_t *
1296 parse_hunks(svn_patch_t *patch, apr_file_t *apr_file,
1297 svn_boolean_t ignore_whitespace,
1298 apr_pool_t *result_pool, apr_pool_t *scratch_pool)
1300 svn_diff_hunk_t *hunk;
1301 svn_boolean_t is_property;
1302 const char *last_prop_name;
1303 const char *prop_name;
1304 svn_diff_operation_kind_t prop_operation;
1305 apr_pool_t *iterpool;
1307 last_prop_name = NULL;
1309 patch->hunks = apr_array_make(result_pool, 10, sizeof(svn_diff_hunk_t *));
1310 patch->prop_patches = apr_hash_make(result_pool);
1311 iterpool = svn_pool_create(scratch_pool);
1314 svn_pool_clear(iterpool);
1316 SVN_ERR(parse_next_hunk(&hunk, &is_property, &prop_name, &prop_operation,
1317 patch, apr_file, ignore_whitespace, result_pool,
1320 if (hunk && is_property)
1323 prop_name = last_prop_name;
1325 last_prop_name = prop_name;
1327 /* Skip svn:mergeinfo properties.
1328 * Mergeinfo data cannot be represented as a hunk and
1329 * is therefore stored in PATCH itself. */
/* NOTE(review): LAST_PROP_NAME starts out NULL and is reset to NULL
 * after every text hunk. If a property hunk header can arrive without
 * a preceding property name line, PROP_NAME would be NULL here and
 * strcmp() would dereference it -- confirm a guard exists. */
1330 if (strcmp(prop_name, SVN_PROP_MERGEINFO) == 0)
1333 SVN_ERR(add_property_hunk(patch, prop_name, hunk, prop_operation,
1338 APR_ARRAY_PUSH(patch->hunks, svn_diff_hunk_t *) = hunk;
1339 last_prop_name = NULL;
1344 svn_pool_destroy(iterpool);
1346 return SVN_NO_ERROR;
1349 /* State machine for the diff header parser.
1350 * Expected Input Required state Function to call */
1351 static struct transition transitions[] =
1353 {"--- ", state_start, diff_minus},
1354 {"+++ ", state_minus_seen, diff_plus},
1355 {"diff --git", state_start, git_start},
1356 {"--- a/", state_git_diff_seen, git_minus},
1357 {"--- a/", state_git_tree_seen, git_minus},
1358 {"--- /dev/null", state_git_tree_seen, git_minus},
1359 {"+++ b/", state_git_minus_seen, git_plus},
1360 {"+++ /dev/null", state_git_minus_seen, git_plus},
1361 {"rename from ", state_git_diff_seen, git_move_from},
1362 {"rename to ", state_move_from_seen, git_move_to},
1363 {"copy from ", state_git_diff_seen, git_copy_from},
1364 {"copy to ", state_copy_from_seen, git_copy_to},
1365 {"new file ", state_git_diff_seen, git_new_file},
1366 {"deleted file ", state_git_diff_seen, git_deleted_file},
1370 svn_diff_parse_next_patch(svn_patch_t **patch_p,
1371 svn_patch_file_t *patch_file,
1372 svn_boolean_t reverse,
1373 svn_boolean_t ignore_whitespace,
1374 apr_pool_t *result_pool,
1375 apr_pool_t *scratch_pool)
1377 apr_off_t pos, last_line;
1379 svn_boolean_t line_after_tree_header_read = FALSE;
1380 apr_pool_t *iterpool;
1382 enum parse_state state = state_start;
1384 if (apr_file_eof(patch_file->apr_file) == APR_EOF)
1386 /* No more patches here. */
1388 return SVN_NO_ERROR;
1391 patch = apr_pcalloc(result_pool, sizeof(*patch));
1393 pos = patch_file->next_patch_offset;
1394 SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_SET, &pos, scratch_pool));
1396 iterpool = svn_pool_create(scratch_pool);
1399 svn_stringbuf_t *line;
1400 svn_boolean_t valid_header_line = FALSE;
1403 svn_pool_clear(iterpool);
1405 /* Remember the current line's offset, and read the line. */
1407 SVN_ERR(svn_io_file_readline(patch_file->apr_file, &line, NULL, &eof,
1408 APR_SIZE_MAX, iterpool, iterpool));
1412 /* Update line offset for next iteration. */
1414 SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_CUR, &pos,
1418 /* Run the state machine. */
1419 for (i = 0; i < (sizeof(transitions) / sizeof(transitions[0])); i++)
1421 if (starts_with(line->data, transitions[i].expected_input)
1422 && state == transitions[i].required_state)
1424 SVN_ERR(transitions[i].fn(&state, line->data, patch,
1425 result_pool, iterpool));
1426 valid_header_line = TRUE;
1431 if (state == state_unidiff_found || state == state_git_header_found)
1433 /* We have a valid diff header, yay! */
1436 else if (state == state_git_tree_seen && line_after_tree_header_read)
1438 /* git patches can contain an index line after the file mode line */
1439 if (!starts_with(line->data, "index "))
1441 /* We have a valid diff header for a patch with only tree changes.
1442 * Rewind to the start of the line just read, so subsequent calls
1443 * to this function don't end up skipping the line -- it may
1444 * contain a patch. */
1445 SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_SET, &last_line,
1450 else if (state == state_git_tree_seen)
1452 line_after_tree_header_read = TRUE;
1454 else if (! valid_header_line && state != state_start
1455 && state != state_git_diff_seen
1456 && !starts_with(line->data, "index "))
1458 /* We've encountered an invalid diff header.
1460 * Rewind to the start of the line just read - it may be a new
1461 * header that begins there. */
1462 SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_SET, &last_line,
1464 state = state_start;
1470 patch->reverse = reverse;
1474 temp = patch->old_filename;
1475 patch->old_filename = patch->new_filename;
1476 patch->new_filename = temp;
1479 if (patch->old_filename == NULL || patch->new_filename == NULL)
1481 /* Something went wrong, just discard the result. */
1485 SVN_ERR(parse_hunks(patch, patch_file->apr_file, ignore_whitespace,
1486 result_pool, iterpool));
1488 svn_pool_destroy(iterpool);
1490 patch_file->next_patch_offset = 0;
1491 SVN_ERR(svn_io_file_seek(patch_file->apr_file, APR_CUR,
1492 &patch_file->next_patch_offset, scratch_pool));
1496 /* Usually, hunks appear in the patch sorted by their original line
1497 * offset. But just in case they weren't parsed in this order for
1498 * some reason, we sort them so that our caller can assume that hunks
1499 * are sorted as if parsed from a usual patch. */
1500 svn_sort__array(patch->hunks, compare_hunks);
1504 return SVN_NO_ERROR;
1508 svn_diff_close_patch_file(svn_patch_file_t *patch_file,
1509 apr_pool_t *scratch_pool)
1511 return svn_error_trace(svn_io_file_close(patch_file->apr_file,