-
+ 7C13EDE685F2C92F24A015E44B9609494E4D5DA9BC6A192B62A9CAB92F36C2543F5092FDADD7436FEAC45CA0AB54D19EA0692848C5C94F9819B885BEEC08E179
vtools/src/analyze.c
(0 . 0)(1 . 597)
2615
2616 #include "diff.h"
2617 #include <cmpbuf.h>
2618 #include <error.h>
2619 #include <xalloc.h>
2620 #include <stdlib.h>
2621
2622 /* The core of the Diff algorithm. */
2623 #define ELEMENT lin
2624 #define EQUAL(x, y) ((x) == (y))
2625 #define OFFSET lin
2626 #define EXTRA_CONTEXT_FIELDS /* none */
2627 #define NOTE_DELETE(c, xoff) (files[0].changed[files[0].realindexes[xoff]] = 1)
2628 #define NOTE_INSERT(c, yoff) (files[1].changed[files[1].realindexes[yoff]] = 1)
2629 #define USE_HEURISTIC 1
2630 #include <limits.h>
2631 #include <diffseq.h>
2632 #include <sys/stat.h>
2633
2634 /* Discard lines from one file that have no matches in the other file.
2635
2636 A line which is discarded will not be considered by the actual
2637 comparison algorithm; it will be as if that line were not in the
2638 file. The file's |realindexes| table maps virtual line numbers
2639 (which don't count the discarded lines) into real line numbers;
2640 this is how the actual comparison algorithm produces results that
2641 are comprehensible when the discarded lines are counted.
2642
2643 When we discard a line, we also mark it as a deletion or insertion
2644 so that it will be printed in the output. */
2645
2646 static void
2647 discard_confusing_lines(struct file_data filevec[]) {
2648 int f;
2649 lin i;
2650 char *discarded[2];
2651 lin *equiv_count[2];
2652 lin *p;
2653
2654 /* Allocate our results. */
2655 p = xmalloc((filevec[0].buffered_lines + filevec[1].buffered_lines)
2656 * (2 * sizeof *p));
2657 for (f = 0; f < 2; f++) {
2658 filevec[f].undiscarded = p;
2659 p += filevec[f].buffered_lines;
2660 filevec[f].realindexes = p;
2661 p += filevec[f].buffered_lines;
2662 }
2663
2664 /* Set up |equiv_count[f][i]| as the number of lines in file |f|
2665 that fall in equivalence class |i|. */
2666
2667 p = zalloc(filevec[0].equiv_max * (2 * sizeof *p));
2668 equiv_count[0] = p;
2669 equiv_count[1] = p + filevec[0].equiv_max;
2670
2671 for (i = 0; i < filevec[0].buffered_lines; ++i)
2672 ++equiv_count[0][filevec[0].equivs[i]];
2673 for (i = 0; i < filevec[1].buffered_lines; ++i)
2674 ++equiv_count[1][filevec[1].equivs[i]];
2675
2676 /* Set up tables of which lines are going to be discarded. */
2677
2678 discarded[0] = zalloc(filevec[0].buffered_lines
2679 + filevec[1].buffered_lines);
2680 discarded[1] = discarded[0] + filevec[0].buffered_lines;
2681
2682 /* Mark to be discarded each line that matches no line of the
2683 other file. If a line matches many lines, mark it as
2684 provisionally discardable. */
2685
2686 for (f = 0; f < 2; f++) {
2687 size_t end = filevec[f].buffered_lines;
2688 char *discards = discarded[f];
2689 lin *counts = equiv_count[1 - f];
2690 lin *equivs = filevec[f].equivs;
2691 size_t many = 5;
2692 size_t tem = end / 64;
2693
2694 /* Multiply |many| by approximate square root of number of
2695 lines. That is the threshold for provisionally discardable
2696 lines. */
2697 while ((tem = tem >> 2) > 0)
2698 many *= 2;
2699
2700 for (i = 0; i < end; i++) {
2701 lin nmatch;
2702 if (equivs[i] == 0)
2703 continue;
2704 nmatch = counts[equivs[i]];
2705 if (nmatch == 0)
2706 discards[i] = 1;
2707 else if (nmatch > many)
2708 discards[i] = 2;
2709 }
2710 }
2711
2712 /* Don't really discard the provisional lines except when they
2713 occur in a run of discardables, with nonprovisionals at the
2714 beginning and end. */
2715
2716 for (f = 0; f < 2; f++) {
2717 lin end = filevec[f].buffered_lines;
2718 register char *discards = discarded[f];
2719
2720 for (i = 0; i < end; i++) {
2721 /* Cancel provisional discards not in middle of run of
2722 discards. */
2723 if (discards[i] == 2)
2724 discards[i] = 0;
2725 else if (discards[i] != 0) {
2726 /* We have found a nonprovisional discard. */
2727 register lin j;
2728 lin length;
2729 lin provisional = 0;
2730
2731 /* Find end of this run of discardable lines. Count
2732 how many are provisionally discardable. */
2733 for (j = i; j < end; j++) {
2734 if (discards[j] == 0)
2735 break;
2736 if (discards[j] == 2)
2737 ++provisional;
2738 }
2739
2740 /* Cancel provisional discards at end, and shrink the
2741 run. */
2742 while (j > i && discards[j - 1] == 2)
2743 discards[--j] = 0, --provisional;
2744
2745 /* Now we have the length of a run of discardable
2746 lines whose first and last are not provisional. */
2747 length = j - i;
2748
2749 /* If $1/4$ of the lines in the run are provisional,
2750 cancel discarding of all provisional lines in the
2751 run. */
2752 if (provisional * 4 > length) {
2753 while (j > i)
2754 if (discards[--j] == 2)
2755 discards[j] = 0;
2756 } else {
2757 register lin consec;
2758 lin minimum = 1;
2759 lin tem = length >> 2;
2760
2761 /* |minimum| is approximate square root of
2762 |length/4|. A subrun of two or more
2763 provisionals can stand when |length| is at
2764 least 16. A subrun of 4 or more can stand when
2765 |LENGTH >= 64|. */
2766 while (0 < (tem >>= 2))
2767 minimum <<= 1;
2768 minimum++;
2769
2770 /* Cancel any subrun of |minimum| or more
2771 provisionals within the larger run. */
2772 for (j = 0, consec = 0; j < length; j++)
2773 if (discards[i + j] != 2)
2774 consec = 0;
2775 else if (minimum == ++consec)
2776 /* Back up to start of subrun, to cancel
2777 it all. */
2778 j -= consec;
2779 else if (minimum < consec)
2780 discards[i + j] = 0;
2781
2782 /* Scan from beginning of run until we find 3 or
2783 more nonprovisionals in a row or until the
2784 first nonprovisional at least 8 lines in.
2785 Until that point, cancel any provisionals. */
2786 for (j = 0, consec = 0; j < length; j++) {
2787 if (j >= 8 && discards[i + j] == 1)
2788 break;
2789 if (discards[i + j] == 2)
2790 consec = 0, discards[i + j] = 0;
2791 else if (discards[i + j] == 0)
2792 consec = 0;
2793 else
2794 consec++;
2795 if (consec == 3)
2796 break;
2797 }
2798
2799 /* I advances to the last line of the run. */
2800 i += length - 1;
2801
2802 /* Same thing, from end. */
2803 for (j = 0, consec = 0; j < length; j++) {
2804 if (j >= 8 && discards[i - j] == 1)
2805 break;
2806 if (discards[i - j] == 2)
2807 consec = 0, discards[i - j] = 0;
2808 else if (discards[i - j] == 0)
2809 consec = 0;
2810 else
2811 consec++;
2812 if (consec == 3)
2813 break;
2814 }
2815 }
2816 }
2817 }
2818 }
2819
2820 /* Actually discard the lines. */
2821 for (f = 0; f < 2; f++) {
2822 char *discards = discarded[f];
2823 lin end = filevec[f].buffered_lines;
2824 lin j = 0;
2825 for (i = 0; i < end; ++i)
2826 if (minimal || discards[i] == 0) {
2827 filevec[f].undiscarded[j] = filevec[f].equivs[i];
2828 filevec[f].realindexes[j++] = i;
2829 } else
2830 filevec[f].changed[i] = 1;
2831 filevec[f].nondiscarded_lines = j;
2832 }
2833
2834 free(discarded[0]);
2835 free(equiv_count[0]);
2836 }
2837
2838 /* Adjust inserts/deletes of identical lines to join changes as much
2839 as possible.
2840
2841 We do something when a run of changed lines include a line at one
2842 end and have an excluded, identical line at the other. We are free
2843 to choose which identical line is included. |compareseq| usually
2844 chooses the one at the beginning, but usually it is cleaner to
2845 consider the following identical line to be the "change". */
2846
2847 static void
2848 shift_boundaries(struct file_data filevec[]) {
2849 int f;
2850
2851 for (f = 0; f < 2; f++) {
2852 char *changed = filevec[f].changed;
2853 char *other_changed = filevec[1 - f].changed;
2854 lin const *equivs = filevec[f].equivs;
2855 lin i = 0;
2856 lin j = 0;
2857 lin i_end = filevec[f].buffered_lines;
2858
2859 while (1) {
2860 lin runlength, start, corresponding;
2861
2862 /* Scan forwards to find beginning of another run of
2863 changes. Also keep track of the corresponding point in
2864 the other file. */
2865
2866 while (i < i_end && !changed[i]) {
2867 while (other_changed[j++])
2868 continue;
2869 i++;
2870 }
2871
2872 if (i == i_end)
2873 break;
2874
2875 start = i;
2876
2877 /* Find the end of this run of changes. */
2878
2879 while (changed[++i])
2880 continue;
2881 while (other_changed[j])
2882 j++;
2883
2884 do {
2885 /* Record the length of this run of changes, so that
2886 we can later determine whether the run has grown. */
2887 runlength = i - start;
2888
2889 /* Move the changed region back, so long as the
2890 previous unchanged line matches the last changed one.
2891 This merges with previous changed regions. */
2892
2893 while (start && equivs[start - 1] == equivs[i - 1]) {
2894 changed[--start] = 1;
2895 changed[--i] = 0;
2896 while (changed[start - 1])
2897 start--;
2898 while (other_changed[--j])
2899 continue;
2900 }
2901
2902 /* Set |corresponding| to the end of the changed run,
2903 at the last point where it corresponds to a changed run
2904 in the other file. |corresponding == i_end| means no
2905 such point has been found. */
2906 corresponding = other_changed[j - 1] ? i : i_end;
2907
2908 /* Move the changed region forward, so long as the
2909 first changed line matches the following unchanged one.
2910 This merges with following changed regions. Do this
2911 second, so that if there are no merges, the changed
2912 region is moved forward as far as possible. */
2913
2914 while (i != i_end && equivs[start] == equivs[i]) {
2915 changed[start++] = 0;
2916 changed[i++] = 1;
2917 while (changed[i])
2918 i++;
2919 while (other_changed[++j])
2920 corresponding = i;
2921 }
2922 } while (runlength != i - start);
2923
2924 /* If possible, move the fully-merged run of changes back
2925 to a corresponding run in the other file. */
2926
2927 while (corresponding < i) {
2928 changed[--start] = 1;
2929 changed[--i] = 0;
2930 while (other_changed[--j])
2931 continue;
2932 }
2933 }
2934 }
2935 }
2936
2937 /* Cons an additional entry onto the front of an edit script |old|.
2938 |line0| and |line1| are the first affected lines in the two files
2939 (origin 0). |deleted| is the number of lines deleted here from
2940 file 0. |inserted| is the number of lines inserted here in file 1.
2941
2942 If |deleted| is 0 then |line0| is the number of the line before
2943 which the insertion was done; vice versa for |inserted| and
2944 |line1|. */
2945
2946 static struct change *
2947 add_change(lin line0, lin line1, lin deleted, lin inserted,
2948 struct change *old) {
2949 struct change *new = xmalloc(sizeof *new);
2950
2951 new->line0 = line0;
2952 new->line1 = line1;
2953 new->inserted = inserted;
2954 new->deleted = deleted;
2955 new->link = old;
2956 return new;
2957 }
2958
2959 /* Scan the tables of which lines are inserted and deleted, producing
2960 an edit script in reverse order. */
2961
2962 static struct change *
2963 build_reverse_script(struct file_data const filevec[]) {
2964 struct change *script = 0;
2965 char *changed0 = filevec[0].changed;
2966 char *changed1 = filevec[1].changed;
2967 lin len0 = filevec[0].buffered_lines;
2968 lin len1 = filevec[1].buffered_lines;
2969
2970 /* Note that |changedN[lenN]| does exist, and is 0. */
2971
2972 lin i0 = 0, i1 = 0;
2973
2974 while (i0 < len0 || i1 < len1) {
2975 if (changed0[i0] | changed1[i1]) {
2976 lin line0 = i0, line1 = i1;
2977
2978 /* Find \# lines changed here in each file. */
2979 while (changed0[i0]) ++i0;
2980 while (changed1[i1]) ++i1;
2981
2982 /* Record this change. */
2983 script = add_change(line0, line1, i0 - line0, i1 - line1, script);
2984 }
2985
2986 /* We have reached lines in the two files that match each
2987 other. */
2988 i0++, i1++;
2989 }
2990
2991 return script;
2992 }
2993
2994 /* Scan the tables of which lines are inserted and deleted, producing
2995 an edit script in forward order. */
2996
2997 static struct change *
2998 build_script(struct file_data const filevec[]) {
2999 struct change *script = 0;
3000 char *changed0 = filevec[0].changed;
3001 char *changed1 = filevec[1].changed;
3002 lin i0 = filevec[0].buffered_lines, i1 = filevec[1].buffered_lines;
3003
3004 /* Note that |changedN[-1]| does exist, and is 0. */
3005
3006 while (i0 >= 0 || i1 >= 0) {
3007 if (changed0[i0 - 1] | changed1[i1 - 1]) {
3008 lin line0 = i0, line1 = i1;
3009
3010 /* Find \# lines changed here in each file. */
3011 while (changed0[i0 - 1]) --i0;
3012 while (changed1[i1 - 1]) --i1;
3013
3014 /* Record this change. */
3015 script = add_change(i0, i1, line0 - i0, line1 - i1, script);
3016 }
3017
3018 /* We have reached lines in the two files that match each
3019 other. */
3020 i0--, i1--;
3021 }
3022
3023 return script;
3024 }
3025
3026 /* If |changes|, briefly report that two files differed. */
3027 static void
3028 briefly_report(int changes, struct file_data const filevec[]) {
3029 if (changes)
3030 message((brief
3031 ? "Files %s and %s differ\n"
3032 : "Binary files %s and %s differ\n"),
3033 file_label[0] ? file_label[0] : filevec[0].name,
3034 file_label[1] ? file_label[1] : filevec[1].name);
3035 }
3036
3037 /* Report the differences of two files. */
3038 int
3039 diff_2_files(struct comparison *cmp) {
3040 int f;
3041 struct change *e, *p;
3042 struct change *script;
3043 int changes;
3044
3045
3046 /* If we have detected that either file is binary, compare the two
3047 files as binary. This can happen only when the first chunk is
3048 read. */
3049
3050 if (read_files(cmp->file, files_can_be_treated_as_binary)) {
3051 /* Files with different lengths must be different. */
3052 if (cmp->file[0].stat.st_size != cmp->file[1].stat.st_size
3053 && 0 < cmp->file[0].stat.st_size
3054 && 0 < cmp->file[1].stat.st_size
3055 && (cmp->file[0].desc < 0 || S_ISREG (cmp->file[0].stat.st_mode))
3056 && (cmp->file[1].desc < 0 || S_ISREG (cmp->file[1].stat.st_mode)))
3057 changes = 1;
3058
3059 /* Standard input equals itself. */
3060 else if (cmp->file[0].desc == cmp->file[1].desc)
3061 changes = 0;
3062
3063 else
3064 /* Scan both files, a buffer at a time, looking for a
3065 difference. */
3066 {
3067 /* Allocate same-sized buffers for both files. */
3068 size_t lcm_max = PTRDIFF_MAX - 1;
3069 size_t buffer_size =
3070 buffer_lcm(sizeof(word),
3071 buffer_lcm(STAT_BLOCKSIZE (cmp->file[0].stat),
3072 STAT_BLOCKSIZE (cmp->file[1].stat),
3073 lcm_max),
3074 lcm_max);
3075 for (f = 0; f < 2; f++)
3076 cmp->file[f].buffer = xrealloc(cmp->file[f].buffer, buffer_size);
3077
3078 for (;; cmp->file[0].buffered = cmp->file[1].buffered = 0) {
3079 /* Read a buffer's worth from both files. */
3080 for (f = 0; f < 2; f++)
3081 if (0 <= cmp->file[f].desc)
3082 file_block_read(&cmp->file[f],
3083 buffer_size - cmp->file[f].buffered);
3084
3085 /* If the buffers differ, the files differ. */
3086 if (cmp->file[0].buffered != cmp->file[1].buffered
3087 || memcmp(cmp->file[0].buffer,
3088 cmp->file[1].buffer,
3089 cmp->file[0].buffered)) {
3090 changes = 1;
3091 break;
3092 }
3093
3094 /* If we reach end of file, the files are the
3095 same. */
3096 if (cmp->file[0].buffered != buffer_size) {
3097 changes = 0;
3098 break;
3099 }
3100 }
3101 }
3102
3103 briefly_report(changes, cmp->file);
3104 } else {
3105 struct context ctxt;
3106 lin diags;
3107 lin too_expensive;
3108
3109 /* Allocate vectors for the results of comparison: a flag for
3110 each line of each file, saying whether that line is an
3111 insertion or deletion. Allocate an extra element, always 0, at
3112 each end of each vector. */
3113
3114 size_t s = cmp->file[0].buffered_lines + cmp->file[1].buffered_lines + 4;
3115 char *flag_space = zalloc(s);
3116 cmp->file[0].changed = flag_space + 1;
3117 cmp->file[1].changed = flag_space + cmp->file[0].buffered_lines + 3;
3118
3119 /* Some lines are obviously insertions or deletions because
3120 they don't match anything. Detect them now, and avoid even
3121 thinking about them in the main comparison algorithm. */
3122
3123 discard_confusing_lines(cmp->file);
3124
3125 /* Now do the main comparison algorithm, considering just the
3126 undiscarded lines. */
3127
3128 ctxt.xvec = cmp->file[0].undiscarded;
3129 ctxt.yvec = cmp->file[1].undiscarded;
3130 diags = (cmp->file[0].nondiscarded_lines
3131 + cmp->file[1].nondiscarded_lines + 3);
3132 ctxt.fdiag = xmalloc(diags * (2 * sizeof *ctxt.fdiag));
3133 ctxt.bdiag = ctxt.fdiag + diags;
3134 ctxt.fdiag += cmp->file[1].nondiscarded_lines + 1;
3135 ctxt.bdiag += cmp->file[1].nondiscarded_lines + 1;
3136
3137 ctxt.heuristic = speed_large_files;
3138
3139 /* Set |too_expensive| to be the approximate square root of
3140 the input size, bounded below by 4096. 4096 seems to be good
3141 for circa-2016 CPUs; see |Bug#16848| and |Bug#24715|. */
3142 too_expensive = 1;
3143 for (; diags != 0; diags >>= 2)
3144 too_expensive <<= 1;
3145 ctxt.too_expensive = MAX (4096, too_expensive);
3146
3147 files[0] = cmp->file[0];
3148 files[1] = cmp->file[1];
3149
3150 compareseq(0, cmp->file[0].nondiscarded_lines,
3151 0, cmp->file[1].nondiscarded_lines, minimal, &ctxt);
3152
3153 free(ctxt.fdiag - (cmp->file[1].nondiscarded_lines + 1));
3154
3155 /* Modify the results slightly to make them prettier in cases
3156 where that can validly be done. */
3157
3158 shift_boundaries(cmp->file);
3159
3160 /* Get the results of comparison in the form of a chain of
3161 |struct change|s -- an edit script. */
3162
3163 script = build_script(cmp->file);
3164
3165 /* Set |changes| if we had any diffs. If some changes are
3166 ignored, we must scan the script to decide. */
3167 changes = (script != 0);
3168
3169 if (brief)
3170 briefly_report(changes, cmp->file);
3171 else {
3172 if (changes) {
3173 /* Record info for starting up output, to be used if
3174 and when we have some output to print. */
3175 setup_output(file_label[0] ? file_label[0] : cmp->file[0].name,
3176 file_label[1] ? file_label[1] : cmp->file[1].name,
3177 cmp->parent != 0);
3178 print_context_script(script);
3179 finish_output();
3180 }
3181 }
3182
3183 free(cmp->file[0].undiscarded);
3184
3185 free(flag_space);
3186
3187 for (f = 0; f < 2; f++) {
3188 free(cmp->file[f].equivs);
3189 free(cmp->file[f].linbuf + cmp->file[f].linbuf_base);
3190 }
3191
3192 for (e = script; e; e = p) {
3193 p = e->link;
3194 free(e);
3195 }
3196
3197 for (f = 0; f < 2; ++f)
3198 if (cmp->file[f].missing_newline) {
3199 error(0, 0, "%s: %s\n",
3200 file_label[f] ? file_label[f] : cmp->file[f].name,
3201 "No newline at end of file");
3202 changes = 2;
3203 }
3204 }
3205
3206 if (cmp->file[0].buffer != cmp->file[1].buffer)
3207 free(cmp->file[0].buffer);
3208 free(cmp->file[1].buffer);
3209
3210 return changes;
3211 }