-
+ 9AB0950EFF37136222DC461537F8448EBC36DFF595E77AF294E48F9142657ED6DA78D72C232ECC890E28BA75A2160D7AB3A09E1A7AA61C50AC95C7A747EF2B7A
vtools/src/io.c
(0 . 0)(1 . 596)
4364
4365 #include "diff.h"
4366 #include <cmpbuf.h>
4367 #include <xalloc.h>
4368 #include <string.h>
4369 #include <limits.h>
4370 #include <stdlib.h>
4371
4372 /* Rotate an unsigned value to the left. */
4373 #define ROL(v, n) ((v) << (n) | (v) >> (sizeof (v) * CHAR_BIT - (n)))
4374
4375 /* Given a hash value and a new character, return a new hash
4376 value. */
4377 #define HASH(h, c) ((c) + ROL (h, 7))
4378
4379 /* The type of a hash value. */
4380 typedef size_t hash_value;
4381
4382 /* Lines are put into equivalence classes of lines that match in
4383 |lines_differ|. Each equivalence class is represented by one of
4384 these structures, but only while the classes are being computed.
4385 Afterward, each class is represented by a number. */
4386 struct equivclass {
4387 lin next; /* Next item in this bucket. */
4388 hash_value hash; /* Hash of lines in this class. */
4389 char const *line; /* A line that fits this class. */
4390 size_t length; /* That line's length, not counting its newline. */
4391 };
4392
4393 /* Hash-table: array of buckets, each being a chain of equivalence
4394 classes. |buckets[-1]| is reserved for incomplete lines. */
4395 static lin *buckets;
4396
4397 /* Number of buckets in the hash table array, not counting
4398 |buckets[-1]|. */
4399 static size_t nbuckets;
4400
4401 /* Array in which the equivalence classes are allocated. The
4402 bucket-chains go through the elements in this array. The number of
4403 an equivalence class is its index in this array. */
4404 static struct equivclass *equivs;
4405
4406 /* Index of first free element in the array |equivs|. */
4407 static lin equivs_index;
4408
4409 /* Number of elements allocated in the array |equivs|. */
4410 static lin equivs_alloc;
4411
4412 /* Read a block of data into a file buffer, checking for EOF and
4413 error. */
4414
4415 void
4416 file_block_read(struct file_data *current, size_t size) {
4417 if (size && !current->eof) {
4418 size_t s = block_read(current->desc,
4419 FILE_BUFFER (current) + current->buffered, size);
4420 if (s == SIZE_MAX)
4421 pfatal_with_name(current->name);
4422
4423 current->buffered += s;
4424 current->eof = s < size;
4425 }
4426 }
4427
4428 /* Check for binary files and compare them for exact identity. */
4429
4430 /* Return 1 if |buf| contains a non text character. |size| is the
4431 number of characters in |buf|. */
4432
4433 #define binary_file_p(buf, size) (memchr (buf, 0, size) != 0)
4434
4435 /* Get ready to read the current file. Return nonzero if |skip_test|
4436 is zero, and if it appears to be a binary file. */
4437
4438 static bool
4439 sip(struct file_data *current, bool skip_test) {
4440 /* If we have a nonexistent file at this stage, treat it as
4441 empty. */
4442 if (current->desc < 0) {
4443 /* Leave room for a sentinel. */
4444 current->bufsize = sizeof(word);
4445 current->buffer = xmalloc(current->bufsize);
4446 } else {
4447 current->bufsize = buffer_lcm(sizeof(word),
4448 STAT_BLOCKSIZE (current->stat),
4449 PTRDIFF_MAX - 2 * sizeof(word));
4450 current->buffer = xmalloc(current->bufsize);
4451
4452 if (!skip_test) {
4453 /* Check first part of file to see if it's a binary file. */
4454
4455 off_t buffered;
4456 file_block_read(current, current->bufsize);
4457 buffered = current->buffered;
4458 return binary_file_p (current->buffer, buffered);
4459 }
4460 }
4461
4462 current->buffered = 0;
4463 current->eof = false;
4464 return false;
4465 }
4466
4467 /* Slurp the rest of the current file completely into memory. */
4468
4469 static void
4470 slurp(struct file_data *current) {
4471 size_t cc;
4472
4473 if (current->desc < 0) {
4474 /* The file is nonexistent. */
4475 return;
4476 }
4477
4478 if (S_ISREG (current->stat.st_mode)) {
4479 /* It's a regular file; slurp in the rest all at once. */
4480
4481 /* Get the size out of the stat block. Allocate just enough
4482 room for appended newline plus word sentinel, plus
4483 word-alignment since we want the buffer word-aligned. */
4484 size_t file_size = current->stat.st_size;
4485 cc = file_size + 2 * sizeof(word) - file_size % sizeof(word);
4486 if (file_size != current->stat.st_size || cc < file_size
4487 || PTRDIFF_MAX <= cc)
4488 xalloc_die();
4489
4490 if (current->bufsize < cc) {
4491 current->bufsize = cc;
4492 current->buffer = xrealloc(current->buffer, cc);
4493 }
4494
4495 /* Try to read at least 1 more byte than the size indicates,
4496 to detect whether the file is growing. This is a nicety for
4497 users who run 'diff' on files while they are changing. */
4498
4499 if (current->buffered <= file_size) {
4500 file_block_read(current, file_size + 1 - current->buffered);
4501 if (current->buffered <= file_size)
4502 return;
4503 }
4504 }
4505
4506 /* It's not a regular file, or it's a growing regular file; read
4507 it, growing the buffer as needed. */
4508
4509 file_block_read(current, current->bufsize - current->buffered);
4510
4511 if (current->buffered) {
4512 while (current->buffered == current->bufsize) {
4513 if (PTRDIFF_MAX / 2 - sizeof(word) < current->bufsize)
4514 xalloc_die();
4515 current->bufsize *= 2;
4516 current->buffer = xrealloc(current->buffer, current->bufsize);
4517 file_block_read(current, current->bufsize - current->buffered);
4518 }
4519
4520 /* Allocate just enough room for appended newline plus word
4521 sentinel, plus word-alignment. */
4522 cc = current->buffered + 2 * sizeof(word);
4523 current->bufsize = cc - cc % sizeof(word);
4524 current->buffer = xrealloc(current->buffer, current->bufsize);
4525 }
4526 }
4527
4528 /* Split the file into lines, simultaneously computing the equivalence
4529 class for each line. */
4530
4531 static void
4532 find_and_hash_each_line(struct file_data *current) {
4533 char const *p = current->prefix_end;
4534 lin i, *bucket;
4535 size_t length;
4536
4537 /* Cache often-used quantities in local variables to help the compiler. */
4538 char const **linbuf = current->linbuf;
4539 lin alloc_lines = current->alloc_lines;
4540 lin line = 0;
4541 lin linbuf_base = current->linbuf_base;
4542 lin *cureqs = xmalloc(alloc_lines * sizeof *cureqs);
4543 struct equivclass *eqs = equivs;
4544 lin eqs_index = equivs_index;
4545 lin eqs_alloc = equivs_alloc;
4546 char const *suffix_begin = current->suffix_begin;
4547 char const *bufend = FILE_BUFFER (current) + current->buffered;
4548
4549
4550 while (p < suffix_begin) {
4551 char const *ip = p;
4552 hash_value h = 0;
4553 unsigned char c;
4554
4555 /* Hash this line until we find a newline. */
4556 while ((c = *p++) != '\n')
4557 h = HASH (h, c);
4558
4559 bucket = &buckets[h % nbuckets];
4560 length = p - ip - 1;
4561
4562 for (i = *bucket;; i = eqs[i].next)
4563 if (!i) {
4564 /* Create a new equivalence class in this bucket. */
4565 i = eqs_index++;
4566 if (i == eqs_alloc) {
4567 if (PTRDIFF_MAX / (2 * sizeof *eqs) <= eqs_alloc)
4568 xalloc_die();
4569 eqs_alloc *= 2;
4570 eqs = xrealloc(eqs, eqs_alloc * sizeof *eqs);
4571 }
4572 eqs[i].next = *bucket;
4573 eqs[i].hash = h;
4574 eqs[i].line = ip;
4575 eqs[i].length = length;
4576 *bucket = i;
4577 break;
4578 } else if (eqs[i].hash == h) {
4579 char const *eqline = eqs[i].line;
4580
4581 /* Reuse existing class if |lines_differ| reports the
4582 lines equal. */
4583 if (eqs[i].length == length) {
4584 /* Reuse existing equivalence class if the lines
4585 are identical. This detects the common case of
4586 exact identity faster than |lines_differ|
4587 would. */
4588 if (memcmp(eqline, ip, length) == 0)
4589 break;
4590 continue;
4591 } else
4592 continue;
4593 }
4594
4595 /* Maybe increase the size of the line table. */
4596 if (line == alloc_lines) {
4597 /* Double |(alloc_lines - linbuf_base)| by adding to
4598 |alloc_lines|. */
4599 if (PTRDIFF_MAX / 3 <= alloc_lines
4600 || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
4601 || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
4602 xalloc_die();
4603 alloc_lines = 2 * alloc_lines - linbuf_base;
4604 cureqs = xrealloc(cureqs, alloc_lines * sizeof *cureqs);
4605 linbuf += linbuf_base;
4606 linbuf = xrealloc(linbuf,
4607 (alloc_lines - linbuf_base) * sizeof *linbuf);
4608 linbuf -= linbuf_base;
4609 }
4610 linbuf[line] = ip;
4611 cureqs[line] = i;
4612 ++line;
4613 }
4614
4615 current->buffered_lines = line;
4616
4617 for (i = 0;; i++) {
4618 /* Record the line start for lines in the suffix that we care
4619 about. Record one more line start than lines, so that we can
4620 compute the length of any buffered line. */
4621 if (line == alloc_lines) {
4622 /* Double |(alloc_lines - linbuf_base)| by adding to
4623 |alloc_lines|. */
4624 if (PTRDIFF_MAX / 3 <= alloc_lines
4625 || PTRDIFF_MAX / sizeof *cureqs <= 2 * alloc_lines - linbuf_base
4626 || PTRDIFF_MAX / sizeof *linbuf <= alloc_lines - linbuf_base)
4627 xalloc_die();
4628 alloc_lines = 2 * alloc_lines - linbuf_base;
4629 linbuf += linbuf_base;
4630 linbuf = xrealloc(linbuf,
4631 (alloc_lines - linbuf_base) * sizeof *linbuf);
4632 linbuf -= linbuf_base;
4633 }
4634 linbuf[line] = p;
4635
4636 if (p == bufend) {
4637 /* If the last line is incomplete and we do not silently
4638 complete lines, don't count its appended newline. */
4639 if (current->missing_newline && false) /* our output style not robust */
4640 linbuf[line]--;
4641 break;
4642 }
4643
4644 line++;
4645
4646 while (*p++ != '\n')
4647 continue;
4648 }
4649
4650 /* Done with cache in local variables. */
4651 current->linbuf = linbuf;
4652 current->valid_lines = line;
4653 current->alloc_lines = alloc_lines;
4654 current->equivs = cureqs;
4655 equivs = eqs;
4656 equivs_alloc = eqs_alloc;
4657 equivs_index = eqs_index;
4658 }
4659
4660 /* Prepare the text. Make sure the text end is initialized. Make
4661 sure text ends in a newline, but remember that we had to add one.
4662 Strip trailing CRs, if that was requested. */
4663
4664 static void
4665 prepare_text(struct file_data *current) {
4666 size_t buffered = current->buffered;
4667 char *p = FILE_BUFFER (current);
4668
4669 if (buffered == 0 || p[buffered - 1] == '\n')
4670 current->missing_newline = false;
4671 else {
4672 p[buffered++] = '\n';
4673 current->missing_newline = true;
4674 }
4675
4676 if (!p)
4677 return;
4678
4679 /* Don't use uninitialized storage when planting or using sentinels. */
4680 memset (p + buffered, 0, sizeof(word));
4681
4682 current->buffered = buffered;
4683 }
4684
4685 /* We have found |n| lines in a buffer of size |s|; guess the
4686 proportionate number of lines that will be found in a buffer of
4687 size |t|. However, do not guess a number of lines so large that
4688 the resulting line table might cause overflow in size
4689 calculations. */
4690 static lin
4691 guess_lines(lin n, size_t s, size_t t) {
4692 size_t guessed_bytes_per_line = n < 10 ? 32 : s / (n - 1);
4693 lin guessed_lines = MAX (1, t / guessed_bytes_per_line);
4694 return MIN (guessed_lines, PTRDIFF_MAX / (2 * sizeof(char *) + 1) - 5) + 5;
4695 }
4696
4697 /* Given a vector of two |file_data| objects, find the identical
4698 prefixes and suffixes of each object. */
4699
4700 static void
4701 find_identical_ends(struct file_data filevec[]) {
4702 word *w0, *w1;
4703 char *p0, *p1, *buffer0, *buffer1;
4704 char const *end0, *beg0;
4705 char const **linbuf0, **linbuf1;
4706 lin i, lines;
4707 size_t n0, n1;
4708 lin alloc_lines0, alloc_lines1;
4709 lin buffered_prefix, prefix_count, prefix_mask;
4710 lin middle_guess, suffix_guess;
4711
4712 slurp(&filevec[0]);
4713 prepare_text(&filevec[0]);
4714 if (filevec[0].desc != filevec[1].desc) {
4715 slurp(&filevec[1]);
4716 prepare_text(&filevec[1]);
4717 } else {
4718 filevec[1].buffer = filevec[0].buffer;
4719 filevec[1].bufsize = filevec[0].bufsize;
4720 filevec[1].buffered = filevec[0].buffered;
4721 filevec[1].missing_newline = filevec[0].missing_newline;
4722 }
4723
4724 /* Find identical prefix. */
4725
4726 w0 = filevec[0].buffer;
4727 w1 = filevec[1].buffer;
4728 p0 = buffer0 = (char *) w0;
4729 p1 = buffer1 = (char *) w1;
4730 n0 = filevec[0].buffered;
4731 n1 = filevec[1].buffered;
4732
4733 if (p0 == p1)
4734 /* The buffers are the same; sentinels won't work. */
4735 p0 = p1 += n1;
4736 else {
4737 /* Insert end sentinels, in this case characters that are
4738 guaranteed to make the equality test false, and thus terminate
4739 the loop. */
4740
4741 if (n0 < n1)
4742 p0[n0] = ~p1[n0];
4743 else
4744 p1[n1] = ~p0[n1];
4745
4746 /* Loop until first mismatch, or to the sentinel
4747 characters. */
4748
4749 /* Compare a word at a time for speed. */
4750 while (*w0 == *w1)
4751 w0++, w1++;
4752
4753 /* Do the last few bytes of comparison a byte at a time. */
4754 p0 = (char *) w0;
4755 p1 = (char *) w1;
4756 while (*p0 == *p1)
4757 p0++, p1++;
4758
4759 /* Don't mistakenly count missing newline as part of
4760 prefix. */
4761 if (false
4762 && ((buffer0 + n0 - filevec[0].missing_newline < p0)
4763 !=
4764 (buffer1 + n1 - filevec[1].missing_newline < p1)))
4765 p0--, p1--;
4766 }
4767
4768 /* Now |p0| and |p1| point at the first nonmatching
4769 characters. */
4770
4771 /* Skip back to last line-beginning in the prefix, and then
4772 discard up to |horizon_lines| lines from the prefix. */
4773 i = horizon_lines;
4774 while (p0 != buffer0 && (p0[-1] != '\n' || i--))
4775 p0--, p1--;
4776
4777 /* Record the prefix. */
4778 filevec[0].prefix_end = p0;
4779 filevec[1].prefix_end = p1;
4780
4781 /* Find identical suffix. */
4782
4783 /* |p0| and |p1| point beyond the last chars not yet compared. */
4784 p0 = buffer0 + n0;
4785 p1 = buffer1 + n1;
4786
4787 if (filevec[0].missing_newline == filevec[1].missing_newline) {
4788 end0 = p0; /* Addr of last char in file 0. */
4789
4790 /* Get value of |p0| at which we should stop scanning
4791 backward: this is when either |p0| or |p1| points just past the
4792 last char of the identical prefix. */
4793 beg0 = filevec[0].prefix_end + (n0 < n1 ? 0 : n0 - n1);
4794
4795 /* Scan back until chars don't match or we reach that point. */
4796 while (p0 != beg0)
4797 if (*--p0 != *--p1) {
4798 /* Point at the first char of the matching suffix. */
4799 ++p0, ++p1;
4800 beg0 = p0;
4801 break;
4802 }
4803
4804 /* Are we at a line-beginning in both files? If not, add the
4805 rest of this line to the main body. Discard up to
4806 |horizon_lines| lines from the identical suffix. Also, discard
4807 one extra line, because |shift_boundaries| may need it. */
4808 i = horizon_lines + !((buffer0 == p0 || p0[-1] == '\n')
4809 &&
4810 (buffer1 == p1 || p1[-1] == '\n'));
4811 while (i-- && p0 != end0)
4812 while (*p0++ != '\n')
4813 continue;
4814
4815 p1 += p0 - beg0;
4816 }
4817
4818 /* Record the suffix. */
4819 filevec[0].suffix_begin = p0;
4820 filevec[1].suffix_begin = p1;
4821
4822 /* Calculate number of lines of prefix to save.
4823
4824 |prefix_count == 0| means save the whole prefix; we need this
4825 for options like |-D| that output the whole file, or for
4826 enormous contexts (to avoid worrying about arithmetic
4827 overflow). We also need it for options like |-F| that output
4828 some preceding line; at least we will need to find the last few
4829 lines, but since we don't know how many, it's easiest to find
4830 them all.
4831
4832 Otherwise, |prefix_count != 0|. Save just |prefix_count| lines
4833 at start of the line buffer; they'll be moved to the proper
4834 location later. Handle 1 more line than the context says
4835 (because we count 1 too many), rounded up to the next power of
4836 2 to speed index computation. */
4837
4838 prefix_count = 0;
4839 alloc_lines0 = guess_lines(0, 0, n0);
4840 prefix_mask = prefix_count - 1;
4841 lines = 0;
4842 linbuf0 = xmalloc(alloc_lines0 * sizeof *linbuf0);
4843 p0 = buffer0;
4844
4845 /* If the prefix is needed, find the prefix lines. */
4846 end0 = filevec[0].prefix_end;
4847 while (p0 != end0) {
4848 lin l = lines++ & prefix_mask;
4849 if (l == alloc_lines0) {
4850 if (PTRDIFF_MAX / (2 * sizeof *linbuf0) <= alloc_lines0)
4851 xalloc_die();
4852 alloc_lines0 *= 2;
4853 linbuf0 = xrealloc(linbuf0, alloc_lines0 * sizeof *linbuf0);
4854 }
4855 linbuf0[l] = p0;
4856 while (*p0++ != '\n')
4857 continue;
4858 }
4859 buffered_prefix = prefix_count && context < lines ? context : lines;
4860
4861 /* Allocate line buffer 1. */
4862
4863 middle_guess = guess_lines(lines, p0 - buffer0, p1 - filevec[1].prefix_end);
4864 suffix_guess = guess_lines(lines, p0 - buffer0, buffer1 + n1 - p1);
4865 alloc_lines1 = buffered_prefix + middle_guess + MIN (context, suffix_guess);
4866 if (alloc_lines1 < buffered_prefix
4867 || PTRDIFF_MAX / sizeof *linbuf1 <= alloc_lines1)
4868 xalloc_die();
4869 linbuf1 = xmalloc(alloc_lines1 * sizeof *linbuf1);
4870
4871 if (buffered_prefix != lines) {
4872 /* Rotate prefix lines to proper location. */
4873 for (i = 0; i < buffered_prefix; i++)
4874 linbuf1[i] = linbuf0[(lines - context + i) & prefix_mask];
4875 for (i = 0; i < buffered_prefix; i++)
4876 linbuf0[i] = linbuf1[i];
4877 }
4878
4879 /* Initialize line buffer 1 from line buffer 0. */
4880 for (i = 0; i < buffered_prefix; i++)
4881 linbuf1[i] = linbuf0[i] - buffer0 + buffer1;
4882
4883 /* Record the line buffer, adjusted so that
4884 |linbuf[0]| points at the first differing line. */
4885 filevec[0].linbuf = linbuf0 + buffered_prefix;
4886 filevec[1].linbuf = linbuf1 + buffered_prefix;
4887 filevec[0].linbuf_base = filevec[1].linbuf_base = -buffered_prefix;
4888 filevec[0].alloc_lines = alloc_lines0 - buffered_prefix;
4889 filevec[1].alloc_lines = alloc_lines1 - buffered_prefix;
4890 filevec[0].prefix_lines = filevec[1].prefix_lines = lines;
4891 }
4892
4893 /* If $1 < k$, then $(2^k - {\rm prime\_offset}_k)$ is the largest
4894 prime less than $2^k$. This table is derived from
4895 Chris~K.~Caldwell's list
4896 \url{http://www.utm.edu/research/primes/lists/2small/}. */
4897
4898 static unsigned char const prime_offset[] =
4899 {
4900 0, 0, 1, 1, 3, 1, 3, 1, 5, 3, 3, 9, 3, 1, 3, 19, 15, 1, 5, 1, 3, 9, 3,
4901 15, 3, 39, 5, 39, 57, 3, 35, 1, 5, 9, 41, 31, 5, 25, 45, 7, 87, 21,
4902 11, 57, 17, 55, 21, 115, 59, 81, 27, 129, 47, 111, 33, 55, 5, 13, 27,
4903 55, 93, 1, 57, 25
4904 };
4905
4906 /* Given a vector of two |file_data| objects, read the file associated
4907 with each one, and build the table of equivalence classes. Return
4908 nonzero if either file appears to be a binary file. If
4909 |pretend_binary| is nonzero, pretend they are binary
4910 regardless. */
4911
4912 bool
4913 read_files(struct file_data filevec[], bool pretend_binary) {
4914 int i;
4915 bool skip_test = text | pretend_binary;
4916 bool appears_binary = pretend_binary | sip(&filevec[0], skip_test);
4917
4918 if (filevec[0].desc != filevec[1].desc)
4919 appears_binary |= sip(&filevec[1], skip_test | appears_binary);
4920 else {
4921 filevec[1].buffer = filevec[0].buffer;
4922 filevec[1].bufsize = filevec[0].bufsize;
4923 filevec[1].buffered = filevec[0].buffered;
4924 }
4925 if (appears_binary) {
4926 return true;
4927 }
4928
4929 find_identical_ends(filevec);
4930
4931 equivs_alloc = filevec[0].alloc_lines + filevec[1].alloc_lines + 1;
4932 if (PTRDIFF_MAX / sizeof *equivs <= equivs_alloc)
4933 xalloc_die();
4934 equivs = xmalloc(equivs_alloc * sizeof *equivs);
4935 /* Equivalence class 0 is permanently safe for lines that were not
4936 hashed. Real equivalence classes start at 1. */
4937 equivs_index = 1;
4938
4939 /* Allocate (one plus) a prime number of hash buckets. Use a
4940 prime number between $1/3$ and $2/3$ of the value of
4941 |equiv_allocs|, approximately. */
4942 for (i = 9; (size_t) 1 << i < equivs_alloc / 3; i++)
4943 continue;
4944 nbuckets = ((size_t) 1 << i) - prime_offset[i];
4945 if (PTRDIFF_MAX / sizeof *buckets <= nbuckets)
4946 xalloc_die();
4947 buckets = zalloc((nbuckets + 1) * sizeof *buckets);
4948 buckets++;
4949
4950 for (i = 0; i < 2; i++)
4951 find_and_hash_each_line(&filevec[i]);
4952
4953 filevec[0].equiv_max = filevec[1].equiv_max = equivs_index;
4954
4955 free(equivs);
4956 free(buckets - 1);
4957
4958 return false;
4959 }