1 // SPDX-License-Identifier: GPL-2.0-only
2 //
3 // Traverse the source tree, parsing all .gitignore files, and print file paths
4 // that are ignored by git.
5 // The output is suitable to the --exclude-from option of tar.
6 // This is useful until the --exclude-vcs-ignores option gets working correctly.
7 //
8 // Copyright (C) 2023 Masahiro Yamada <masahiroy@kernel.org>
9 //                      (a lot of code imported from GIT)
10 
11 #include <assert.h>
12 #include <dirent.h>
13 #include <errno.h>
14 #include <fcntl.h>
15 #include <getopt.h>
16 #include <stdarg.h>
17 #include <stdbool.h>
18 #include <stdio.h>
19 #include <stdlib.h>
20 #include <string.h>
21 #include <sys/stat.h>
22 #include <sys/types.h>
23 #include <unistd.h>
24 
25 // Imported from commit 23c56f7bd5f1667f8b793d796bf30e39545920f6 in GIT
26 //
27 //---------------------------(IMPORT FROM GIT BEGIN)---------------------------
28 
29 // Copied from environment.c
30 
// Set from the -i/--ignore-case command-line option. When true, wildmatch()
// adds WM_CASEFOLD and fspathncmp() compares with strncasecmp().
static bool ignore_case;
32 
33 // Copied from git-compat-util.h
34 
/* Sane ctype - no locale, and works with signed chars */
/* Drop any macro definitions of these names so the table-driven ones below are used. */
#undef isascii
#undef isspace
#undef isdigit
#undef isalpha
#undef isalnum
#undef isprint
#undef islower
#undef isupper
#undef tolower
#undef toupper
#undef iscntrl
#undef ispunct
#undef isxdigit

/* Forward declaration of the classification table; its initializer appears later in the file. */
static const unsigned char sane_ctype[256];
/* Class bits stored in sane_ctype[]; tested with sane_istest(). */
#define GIT_SPACE 0x01
#define GIT_DIGIT 0x02
#define GIT_ALPHA 0x04
#define GIT_GLOB_SPECIAL 0x08
#define GIT_REGEX_SPECIAL 0x10
#define GIT_PATHSPEC_MAGIC 0x20
#define GIT_CNTRL 0x40
#define GIT_PUNCT 0x80
/* True if the table entry for x (indexed as unsigned char) has any bit of mask set. */
#define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0)
#define isascii(x) (((x) & ~0x7f) == 0)
#define isspace(x) sane_istest(x,GIT_SPACE)
#define isdigit(x) sane_istest(x,GIT_DIGIT)
#define isalpha(x) sane_istest(x,GIT_ALPHA)
#define isalnum(x) sane_istest(x,GIT_ALPHA | GIT_DIGIT)
#define isprint(x) ((x) >= 0x20 && (x) <= 0x7e)
#define islower(x) sane_iscase(x, 1)
#define isupper(x) sane_iscase(x, 0)
#define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL)
#define iscntrl(x) (sane_istest(x,GIT_CNTRL))
#define ispunct(x) sane_istest(x, GIT_PUNCT | GIT_REGEX_SPECIAL | \
		GIT_GLOB_SPECIAL | GIT_PATHSPEC_MAGIC)
/* Relies on hexval_table[], which maps non-hex bytes to -1. */
#define isxdigit(x) (hexval_table[(unsigned char)(x)] != -1)
/* Case conversion works by forcing bit 0x20 on (lower) or off (upper); see sane_case(). */
#define tolower(x) sane_case((unsigned char)(x), 0x20)
#define toupper(x) sane_case((unsigned char)(x), 0)
75 
sane_case(int x,int high)76 static inline int sane_case(int x, int high)
77 {
78 	if (sane_istest(x, GIT_ALPHA))
79 		x = (x & ~0x20) | high;
80 	return x;
81 }
82 
sane_iscase(int x,int is_lower)83 static inline int sane_iscase(int x, int is_lower)
84 {
85 	if (!sane_istest(x, GIT_ALPHA))
86 		return 0;
87 
88 	if (is_lower)
89 		return (x & 0x20) != 0;
90 	else
91 		return (x & 0x20) == 0;
92 }
93 
94 // Copied from ctype.c
95 
/* One-letter aliases for the GIT_* class bits, used to keep the sane_ctype[] table compact. */
enum {
	S = GIT_SPACE,
	A = GIT_ALPHA,
	D = GIT_DIGIT,
	G = GIT_GLOB_SPECIAL,	/* *, ?, [, \\ */
	R = GIT_REGEX_SPECIAL,	/* $, (, ), +, ., ^, {, | */
	P = GIT_PATHSPEC_MAGIC, /* other non-alnum, except for ] and } */
	X = GIT_CNTRL,
	U = GIT_PUNCT,
	Z = GIT_CNTRL | GIT_SPACE	/* control characters that also count as whitespace */
};
107 
/*
 * Character classification table indexed by unsigned char value.
 * Only the first 128 entries are listed; the remaining entries are
 * zero-initialized, so bytes >= 128 belong to no class.
 */
static const unsigned char sane_ctype[256] = {
	X, X, X, X, X, X, X, X, X, Z, Z, X, X, Z, X, X,		/*   0.. 15 */
	X, X, X, X, X, X, X, X, X, X, X, X, X, X, X, X,		/*  16.. 31 */
	S, P, P, P, R, P, P, P, R, R, G, R, P, P, R, P,		/*  32.. 47 */
	D, D, D, D, D, D, D, D, D, D, P, P, P, P, P, G,		/*  48.. 63 */
	P, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  64.. 79 */
	A, A, A, A, A, A, A, A, A, A, A, G, G, U, R, P,		/*  80.. 95 */
	P, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A,		/*  96..111 */
	A, A, A, A, A, A, A, A, A, A, A, R, R, U, P, X,		/* 112..127 */
	/* Nothing in the 128.. range */
};
119 
120 // Copied from hex.c
121 
/*
 * Maps a byte to the value of the hexadecimal digit it represents
 * ('0'-'9', 'A'-'F', 'a'-'f' -> 0..15), or -1 for every other byte.
 * In this file it is only consulted through the isxdigit() macro.
 */
static const signed char hexval_table[256] = {
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* 00-07 */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* 08-0f */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* 10-17 */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* 18-1f */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* 20-27 */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* 28-2f */
	  0,  1,  2,  3,  4,  5,  6,  7,		/* 30-37 */
	  8,  9, -1, -1, -1, -1, -1, -1,		/* 38-3f */
	 -1, 10, 11, 12, 13, 14, 15, -1,		/* 40-47 */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* 48-4f */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* 50-57 */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* 58-5f */
	 -1, 10, 11, 12, 13, 14, 15, -1,		/* 60-67 */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* 68-6f */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* 70-77 */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* 78-7f */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* 80-87 */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* 88-8f */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* 90-97 */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* 98-9f */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* a0-a7 */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* a8-af */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* b0-b7 */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* b8-bf */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* c0-c7 */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* c8-cf */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* d0-d7 */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* d8-df */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* e0-e7 */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* e8-ef */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* f0-f7 */
	 -1, -1, -1, -1, -1, -1, -1, -1,		/* f8-ff */
};
156 
157 // Copied from wildmatch.h
158 
/* Flag bits accepted by wildmatch()/dowild(). */
#define WM_CASEFOLD 1	/* fold uppercase letters to lowercase before comparing */
#define WM_PATHNAME 2	/* '?', a single '*' and bracket expressions do not match '/' */

/* Result codes of wildmatch()/dowild(); note that a match is 0. */
#define WM_NOMATCH 1
#define WM_MATCH 0
#define WM_ABORT_ALL -1
#define WM_ABORT_TO_STARSTAR -2
166 
167 // Copied from wildmatch.c
168 
/* Pattern and text bytes are handled as unsigned so they can index the ctype tables. */
typedef unsigned char uchar;

// local modification: remove NEGATE_CLASS(2)

/*
 * True if the "len" bytes at "class" spell exactly the string literal
 * "litmatch" (used to recognise POSIX class names such as "alnum").
 */
#define CC_EQ(class, len, litmatch) ((len) == sizeof (litmatch)-1 \
				    && *(class) == *(litmatch) \
				    && strncmp((char*)class, litmatch, len) == 0)

// local modification: simplify macros
/* Character-class predicates used for [:class:] expressions; they map onto the sane ctype macros. */
#define ISBLANK(c) ((c) == ' ' || (c) == '\t')
#define ISGRAPH(c) (isprint(c) && !isspace(c))
#define ISPRINT(c) isprint(c)
#define ISDIGIT(c) isdigit(c)
#define ISALNUM(c) isalnum(c)
#define ISALPHA(c) isalpha(c)
#define ISCNTRL(c) iscntrl(c)
#define ISLOWER(c) islower(c)
#define ISPUNCT(c) ispunct(c)
#define ISSPACE(c) isspace(c)
#define ISUPPER(c) isupper(c)
#define ISXDIGIT(c) isxdigit(c)
190 
/*
 * Match pattern "p" against "text".
 *
 * Both strings must be NUL-terminated. "flags" is a combination of
 * WM_CASEFOLD (uppercase letters in pattern and text are folded to
 * lowercase before comparing) and WM_PATHNAME ('?', a single '*' and
 * bracket expressions refuse to match '/').
 *
 * Returns WM_MATCH, WM_NOMATCH, or one of the WM_ABORT_* codes:
 * WM_ABORT_ALL when the text ends before the pattern does or the pattern
 * is malformed, WM_ABORT_TO_STARSTAR when a single '*' would have to
 * cross a '/'. The function recurses on itself to try the remainder of
 * the pattern after a '*'.
 */
static int dowild(const uchar *p, const uchar *text, unsigned int flags)
{
	uchar p_ch;
	const uchar *pattern = p;	/* start of the pattern, needed to detect a leading "**" */

	for ( ; (p_ch = *p) != '\0'; text++, p++) {
		int matched, match_slash, negated;
		uchar t_ch, prev_ch;
		/* Text exhausted: only a '*' can still match (it may match nothing). */
		if ((t_ch = *text) == '\0' && p_ch != '*')
			return WM_ABORT_ALL;
		if ((flags & WM_CASEFOLD) && ISUPPER(t_ch))
			t_ch = tolower(t_ch);
		if ((flags & WM_CASEFOLD) && ISUPPER(p_ch))
			p_ch = tolower(p_ch);
		switch (p_ch) {
		case '\\':
			/* Literal match with following character.  Note that the test
			 * in "default" handles the p[1] == '\0' failure case. */
			p_ch = *++p;
			/* FALLTHROUGH */
		default:
			if (t_ch != p_ch)
				return WM_NOMATCH;
			continue;
		case '?':
			/* Match anything but '/'. */
			if ((flags & WM_PATHNAME) && t_ch == '/')
				return WM_NOMATCH;
			continue;
		case '*':
			/* Decide whether this run of asterisks may match '/' (match_slash). */
			if (*++p == '*') {
				const uchar *prev_p = p - 2;
				/* Collapse any further consecutive asterisks. */
				while (*++p == '*') {}
				if (!(flags & WM_PATHNAME))
					/* without WM_PATHNAME, '*' == '**' */
					match_slash = 1;
				else if ((prev_p < pattern || *prev_p == '/') &&
				    (*p == '\0' || *p == '/' ||
				     (p[0] == '\\' && p[1] == '/'))) {
					/*
					 * Assuming we already match 'foo/' and are at
					 * <star star slash>, just assume it matches
					 * nothing and go ahead match the rest of the
					 * pattern with the remaining string. This
					 * helps make foo/<*><*>/bar (<> because
					 * otherwise it breaks C comment syntax) match
					 * both foo/bar and foo/a/bar.
					 */
					if (p[0] == '/' &&
					    dowild(p + 1, text, flags) == WM_MATCH)
						return WM_MATCH;
					match_slash = 1;
				} else /* WM_PATHNAME is set */
					match_slash = 0;
			} else
				/* without WM_PATHNAME, '*' == '**' */
				match_slash = flags & WM_PATHNAME ? 0 : 1;
			if (*p == '\0') {
				/* Trailing "**" matches everything.  Trailing "*" matches
				 * only if there are no more slash characters. */
				if (!match_slash) {
					if (strchr((char *)text, '/'))
						return WM_NOMATCH;
				}
				return WM_MATCH;
			} else if (!match_slash && *p == '/') {
				/*
				 * _one_ asterisk followed by a slash
				 * with WM_PATHNAME matches the next
				 * directory
				 */
				const char *slash = strchr((char*)text, '/');
				if (!slash)
					return WM_NOMATCH;
				text = (const uchar*)slash;
				/* the slash is consumed by the top-level for loop */
				break;
			}
			/* Try the rest of the pattern at every remaining text position. */
			while (1) {
				if (t_ch == '\0')
					break;
				/*
				 * Try to advance faster when an asterisk is
				 * followed by a literal. We know in this case
				 * that the string before the literal
				 * must belong to "*".
				 * If match_slash is false, do not look past
				 * the first slash as it cannot belong to '*'.
				 */
				if (!is_glob_special(*p)) {
					p_ch = *p;
					if ((flags & WM_CASEFOLD) && ISUPPER(p_ch))
						p_ch = tolower(p_ch);
					while ((t_ch = *text) != '\0' &&
					       (match_slash || t_ch != '/')) {
						if ((flags & WM_CASEFOLD) && ISUPPER(t_ch))
							t_ch = tolower(t_ch);
						if (t_ch == p_ch)
							break;
						text++;
					}
					if (t_ch != p_ch)
						return WM_NOMATCH;
				}
				/*
				 * Any result other than WM_NOMATCH ends the search, except
				 * that a "**" keeps scanning after WM_ABORT_TO_STARSTAR.
				 */
				if ((matched = dowild(p, text, flags)) != WM_NOMATCH) {
					if (!match_slash || matched != WM_ABORT_TO_STARSTAR)
						return matched;
				} else if (!match_slash && t_ch == '/')
					return WM_ABORT_TO_STARSTAR;
				t_ch = *++text;
			}
			return WM_ABORT_ALL;
		case '[':
			p_ch = *++p;
			/* '^' is accepted as a synonym for '!' (negation). */
			if (p_ch == '^')
				p_ch = '!';
			/* Assign literal 1/0 because of "matched" comparison. */
			negated = p_ch == '!' ? 1 : 0;
			if (negated) {
				/* Inverted character class. */
				p_ch = *++p;
			}
			prev_ch = 0;
			matched = 0;
			do {
				/* Pattern ended before the closing ']'. */
				if (!p_ch)
					return WM_ABORT_ALL;
				if (p_ch == '\\') {
					/* Escaped character inside the set: compare literally. */
					p_ch = *++p;
					if (!p_ch)
						return WM_ABORT_ALL;
					if (t_ch == p_ch)
						matched = 1;
				} else if (p_ch == '-' && prev_ch && p[1] && p[1] != ']') {
					/* Range "prev_ch-p_ch"; the upper bound may be escaped. */
					p_ch = *++p;
					if (p_ch == '\\') {
						p_ch = *++p;
						if (!p_ch)
							return WM_ABORT_ALL;
					}
					if (t_ch <= p_ch && t_ch >= prev_ch)
						matched = 1;
					else if ((flags & WM_CASEFOLD) && ISLOWER(t_ch)) {
						/* t_ch was folded to lowercase; also try its uppercase form. */
						uchar t_ch_upper = toupper(t_ch);
						if (t_ch_upper <= p_ch && t_ch_upper >= prev_ch)
							matched = 1;
					}
					p_ch = 0; /* This makes "prev_ch" get set to 0. */
				} else if (p_ch == '[' && p[1] == ':') {
					/* POSIX character class such as [:alpha:]. */
					const uchar *s;
					int i;
					for (s = p += 2; (p_ch = *p) && p_ch != ']'; p++) {} /*SHARED ITERATOR*/
					if (!p_ch)
						return WM_ABORT_ALL;
					/* i is the length of the class name between "[:" and ":]". */
					i = p - s - 1;
					if (i < 0 || p[-1] != ':') {
						/* Didn't find ":]", so treat like a normal set. */
						p = s - 2;
						p_ch = '[';
						if (t_ch == p_ch)
							matched = 1;
						continue;
					}
					if (CC_EQ(s,i, "alnum")) {
						if (ISALNUM(t_ch))
							matched = 1;
					} else if (CC_EQ(s,i, "alpha")) {
						if (ISALPHA(t_ch))
							matched = 1;
					} else if (CC_EQ(s,i, "blank")) {
						if (ISBLANK(t_ch))
							matched = 1;
					} else if (CC_EQ(s,i, "cntrl")) {
						if (ISCNTRL(t_ch))
							matched = 1;
					} else if (CC_EQ(s,i, "digit")) {
						if (ISDIGIT(t_ch))
							matched = 1;
					} else if (CC_EQ(s,i, "graph")) {
						if (ISGRAPH(t_ch))
							matched = 1;
					} else if (CC_EQ(s,i, "lower")) {
						if (ISLOWER(t_ch))
							matched = 1;
					} else if (CC_EQ(s,i, "print")) {
						if (ISPRINT(t_ch))
							matched = 1;
					} else if (CC_EQ(s,i, "punct")) {
						if (ISPUNCT(t_ch))
							matched = 1;
					} else if (CC_EQ(s,i, "space")) {
						if (ISSPACE(t_ch))
							matched = 1;
					} else if (CC_EQ(s,i, "upper")) {
						if (ISUPPER(t_ch))
							matched = 1;
						else if ((flags & WM_CASEFOLD) && ISLOWER(t_ch))
							matched = 1;
					} else if (CC_EQ(s,i, "xdigit")) {
						if (ISXDIGIT(t_ch))
							matched = 1;
					} else /* malformed [:class:] string */
						return WM_ABORT_ALL;
					p_ch = 0; /* This makes "prev_ch" get set to 0. */
				} else if (t_ch == p_ch)
					matched = 1;
			} while (prev_ch = p_ch, (p_ch = *++p) != ']');
			/* A negated set must not have matched; with WM_PATHNAME a set never matches '/'. */
			if (matched == negated ||
			    ((flags & WM_PATHNAME) && t_ch == '/'))
				return WM_NOMATCH;
			continue;
		}
	}

	/* Pattern consumed: it is a match only if the text is consumed as well. */
	return *text ? WM_NOMATCH : WM_MATCH;
}
408 
409 /* Match the "pattern" against the "text" string. */
wildmatch(const char * pattern,const char * text,unsigned int flags)410 static int wildmatch(const char *pattern, const char *text, unsigned int flags)
411 {
412 	// local modification: move WM_CASEFOLD here
413 	if (ignore_case)
414 		flags |= WM_CASEFOLD;
415 
416 	return dowild((const uchar*)pattern, (const uchar*)text, flags);
417 }
418 
419 // Copied from dir.h
420 
/*
 * Per-pattern flag bits computed by parse_path_pattern(). These are a
 * separate namespace from the WM_* flags even though some values coincide.
 */
#define PATTERN_FLAG_NODIR 1		/* pattern contains no '/' (ignoring a trailing one) */
#define PATTERN_FLAG_ENDSWITH 4		/* pattern is '*' followed by a wildcard-free string */
#define PATTERN_FLAG_MUSTBEDIR 8	/* pattern ended with '/' */
#define PATTERN_FLAG_NEGATIVE 16	/* pattern started with '!' */
425 
426 // Copied from dir.c
427 
fspathncmp(const char * a,const char * b,size_t count)428 static int fspathncmp(const char *a, const char *b, size_t count)
429 {
430 	return ignore_case ? strncasecmp(a, b, count) : strncmp(a, b, count);
431 }
432 
/*
 * Return the length of the leading part of "match" that contains no glob
 * special character, i.e. the offset of the first such character or of
 * the terminating NUL, whichever comes first.
 */
static int simple_length(const char *match)
{
	int len = 0;

	while (match[len] != '\0' && !is_glob_special(match[len]))
		len++;

	return len;
}
444 
/* Return non-zero if "string" contains no glob special character at all. */
static int no_wildcard(const char *string)
{
	const char *stop = string + simple_length(string);

	return *stop == '\0';
}
449 
parse_path_pattern(const char ** pattern,int * patternlen,unsigned * flags,int * nowildcardlen)450 static void parse_path_pattern(const char **pattern,
451 			       int *patternlen,
452 			       unsigned *flags,
453 			       int *nowildcardlen)
454 {
455 	const char *p = *pattern;
456 	size_t i, len;
457 
458 	*flags = 0;
459 	if (*p == '!') {
460 		*flags |= PATTERN_FLAG_NEGATIVE;
461 		p++;
462 	}
463 	len = strlen(p);
464 	if (len && p[len - 1] == '/') {
465 		len--;
466 		*flags |= PATTERN_FLAG_MUSTBEDIR;
467 	}
468 	for (i = 0; i < len; i++) {
469 		if (p[i] == '/')
470 			break;
471 	}
472 	if (i == len)
473 		*flags |= PATTERN_FLAG_NODIR;
474 	*nowildcardlen = simple_length(p);
475 	/*
476 	 * we should have excluded the trailing slash from 'p' too,
477 	 * but that's one more allocation. Instead just make sure
478 	 * nowildcardlen does not exceed real patternlen
479 	 */
480 	if (*nowildcardlen > len)
481 		*nowildcardlen = len;
482 	if (*p == '*' && no_wildcard(p + 1))
483 		*flags |= PATTERN_FLAG_ENDSWITH;
484 	*pattern = p;
485 	*patternlen = len;
486 }
487 
/*
 * Strip trailing unescaped spaces from "buf" in place.
 *
 * A backslash escapes the character after it, so an escaped space is kept
 * and ends any run of trailing spaces seen so far. If the string ends
 * right after a backslash, nothing is trimmed.
 */
static void trim_trailing_spaces(char *buf)
{
	char *trail = NULL;	/* start of the current run of unescaped spaces */
	char *cur = buf;

	while (*cur) {
		if (*cur == ' ') {
			if (trail == NULL)
				trail = cur;
		} else {
			if (*cur == '\\') {
				/* Skip the escaped character as well. */
				cur++;
				if (*cur == '\0')
					return;
			}
			trail = NULL;
		}
		cur++;
	}

	if (trail != NULL)
		*trail = '\0';
}
510 
match_basename(const char * basename,int basenamelen,const char * pattern,int prefix,int patternlen,unsigned flags)511 static int match_basename(const char *basename, int basenamelen,
512 			  const char *pattern, int prefix, int patternlen,
513 			  unsigned flags)
514 {
515 	if (prefix == patternlen) {
516 		if (patternlen == basenamelen &&
517 		    !fspathncmp(pattern, basename, basenamelen))
518 			return 1;
519 	} else if (flags & PATTERN_FLAG_ENDSWITH) {
520 		/* "*literal" matching against "fooliteral" */
521 		if (patternlen - 1 <= basenamelen &&
522 		    !fspathncmp(pattern + 1,
523 				   basename + basenamelen - (patternlen - 1),
524 				   patternlen - 1))
525 			return 1;
526 	} else {
527 		// local modification: call wildmatch() directly
528 		if (!wildmatch(pattern, basename, flags))
529 			return 1;
530 	}
531 	return 0;
532 }
533 
match_pathname(const char * pathname,int pathlen,const char * base,int baselen,const char * pattern,int prefix,int patternlen)534 static int match_pathname(const char *pathname, int pathlen,
535 			  const char *base, int baselen,
536 			  const char *pattern, int prefix, int patternlen)
537 {
538 	// local modification: remove local variables
539 
540 	/*
541 	 * match with FNM_PATHNAME; the pattern has base implicitly
542 	 * in front of it.
543 	 */
544 	if (*pattern == '/') {
545 		pattern++;
546 		patternlen--;
547 		prefix--;
548 	}
549 
550 	/*
551 	 * baselen does not count the trailing slash. base[] may or
552 	 * may not end with a trailing slash though.
553 	 */
554 	if (pathlen < baselen + 1 ||
555 	    (baselen && pathname[baselen] != '/') ||
556 	    fspathncmp(pathname, base, baselen))
557 		return 0;
558 
559 	// local modification: simplified because always baselen > 0
560 	pathname += baselen + 1;
561 	pathlen -= baselen + 1;
562 
563 	if (prefix) {
564 		/*
565 		 * if the non-wildcard part is longer than the
566 		 * remaining pathname, surely it cannot match.
567 		 */
568 		if (prefix > pathlen)
569 			return 0;
570 
571 		if (fspathncmp(pattern, pathname, prefix))
572 			return 0;
573 		pattern += prefix;
574 		patternlen -= prefix;
575 		pathname += prefix;
576 		pathlen -= prefix;
577 
578 		/*
579 		 * If the whole pattern did not have a wildcard,
580 		 * then our prefix match is all we need; we
581 		 * do not need to call fnmatch at all.
582 		 */
583 		if (!patternlen && !pathlen)
584 			return 1;
585 	}
586 
587 	// local modification: call wildmatch() directly
588 	return !wildmatch(pattern, pathname, WM_PATHNAME);
589 }
590 
591 // Copied from git/utf8.c
592 
// UTF-8 byte order mark (bytes EF BB BF, written in octal); skipped when it
// appears at the start of a .gitignore file.
static const char utf8_bom[] = "\357\273\277";
594 
595 //----------------------------(IMPORT FROM GIT END)----------------------------
596 
// One parsed ignore pattern, allocated by add_pattern().
struct pattern {
	unsigned int flags;	// PATTERN_FLAG_* bits
	int nowildcardlen;	// length of the leading wildcard-free part of the pattern
	int patternlen;		// length of the pattern string (without the NUL)
	int dirlen;		// length of the directory string stored after the pattern
	// Two NUL-terminated strings back to back: the pattern itself, then the
	// directory the pattern belongs to (at pattern + patternlen + 1).
	char pattern[];
};
604 
// Growable array of all currently active patterns, in the order they were
// added; nr_patterns entries are in use, alloced_patterns are allocated.
static struct pattern **pattern_list;
static int nr_patterns, alloced_patterns;

// Remember the number of patterns at each directory level
static int *nr_patterns_at;
// Track the current/max directory level;
static int depth, max_depth;
// Enabled by -d/--debug; gates all output of debug().
static bool debug_on;
// out_fp receives the ignored paths; stat_fp (may be NULL) receives the
// stat lines of non-ignored files.
static FILE *out_fp, *stat_fp;
// String that replaces the leading "./" of every printed ignored path.
static char *prefix = "";
// Basename of argv[0], used in error messages.
static char *progname;
616 
/*
 * Report the current errno on stderr, prefixed by "s", and terminate the
 * program with a failure status. Never returns.
 */
static void __attribute__((noreturn)) perror_exit(const char *s)
{
	perror(s);
	exit(EXIT_FAILURE);
}
623 
error_exit(const char * fmt,...)624 static void __attribute__((noreturn)) error_exit(const char *fmt, ...)
625 {
626 	va_list args;
627 
628 	fprintf(stderr, "%s: error: ", progname);
629 
630 	va_start(args, fmt);
631 	vfprintf(stderr, fmt, args);
632 	va_end(args);
633 
634 	exit(EXIT_FAILURE);
635 }
636 
debug(const char * fmt,...)637 static void debug(const char *fmt, ...)
638 {
639 	va_list args;
640 	int i;
641 
642 	if (!debug_on)
643 		return;
644 
645 	fprintf(stderr, "[DEBUG] ");
646 
647 	for (i = 0; i < depth * 2; i++)
648 		fputc(' ', stderr);
649 
650 	va_start(args, fmt);
651 	vfprintf(stderr, fmt, args);
652 	va_end(args);
653 }
654 
xrealloc(void * ptr,size_t size)655 static void *xrealloc(void *ptr, size_t size)
656 {
657 	ptr = realloc(ptr, size);
658 	if (!ptr)
659 		perror_exit(progname);
660 
661 	return ptr;
662 }
663 
/* Allocate "size" bytes; exits the program instead of returning NULL. */
static void *xmalloc(size_t size)
{
	void *mem = xrealloc(NULL, size);

	return mem;
}
668 
669 // similar to last_matching_pattern_from_list() in GIT
is_ignored(const char * path,int pathlen,int dirlen,bool is_dir)670 static bool is_ignored(const char *path, int pathlen, int dirlen, bool is_dir)
671 {
672 	int i;
673 
674 	// Search in the reverse order because the last matching pattern wins.
675 	for (i = nr_patterns - 1; i >= 0; i--) {
676 		struct pattern *p = pattern_list[i];
677 		unsigned int flags = p->flags;
678 		const char *gitignore_dir = p->pattern + p->patternlen + 1;
679 		bool ignored;
680 
681 		if ((flags & PATTERN_FLAG_MUSTBEDIR) && !is_dir)
682 			continue;
683 
684 		if (flags & PATTERN_FLAG_NODIR) {
685 			if (!match_basename(path + dirlen + 1,
686 					    pathlen - dirlen - 1,
687 					    p->pattern,
688 					    p->nowildcardlen,
689 					    p->patternlen,
690 					    p->flags))
691 				continue;
692 		} else {
693 			if (!match_pathname(path, pathlen,
694 					    gitignore_dir, p->dirlen,
695 					    p->pattern,
696 					    p->nowildcardlen,
697 					    p->patternlen))
698 				continue;
699 		}
700 
701 		debug("%s: matches %s%s%s (%s/.gitignore)\n", path,
702 		      flags & PATTERN_FLAG_NEGATIVE ? "!" : "", p->pattern,
703 		      flags & PATTERN_FLAG_MUSTBEDIR ? "/" : "",
704 		      gitignore_dir);
705 
706 		ignored = (flags & PATTERN_FLAG_NEGATIVE) == 0;
707 		if (ignored)
708 			debug("Ignore: %s\n", path);
709 
710 		return ignored;
711 	}
712 
713 	debug("%s: no match\n", path);
714 
715 	return false;
716 }
717 
add_pattern(const char * string,const char * dir,int dirlen)718 static void add_pattern(const char *string, const char *dir, int dirlen)
719 {
720 	struct pattern *p;
721 	int patternlen, nowildcardlen;
722 	unsigned int flags;
723 
724 	parse_path_pattern(&string, &patternlen, &flags, &nowildcardlen);
725 
726 	if (patternlen == 0)
727 		return;
728 
729 	p = xmalloc(sizeof(*p) + patternlen + dirlen + 2);
730 
731 	memcpy(p->pattern, string, patternlen);
732 	p->pattern[patternlen] = 0;
733 	memcpy(p->pattern + patternlen + 1, dir, dirlen);
734 	p->pattern[patternlen + 1 + dirlen] = 0;
735 
736 	p->patternlen = patternlen;
737 	p->nowildcardlen = nowildcardlen;
738 	p->dirlen = dirlen;
739 	p->flags = flags;
740 
741 	debug("Add pattern: %s%s%s\n",
742 	      flags & PATTERN_FLAG_NEGATIVE ? "!" : "", p->pattern,
743 	      flags & PATTERN_FLAG_MUSTBEDIR ? "/" : "");
744 
745 	if (nr_patterns >= alloced_patterns) {
746 		alloced_patterns += 128;
747 		pattern_list = xrealloc(pattern_list,
748 					sizeof(*pattern_list) * alloced_patterns);
749 	}
750 
751 	pattern_list[nr_patterns++] = p;
752 }
753 
754 // similar to add_patterns_from_buffer() in GIT
add_patterns_from_gitignore(const char * dir,int dirlen)755 static void add_patterns_from_gitignore(const char *dir, int dirlen)
756 {
757 	struct stat st;
758 	char path[PATH_MAX], *buf, *entry;
759 	size_t size;
760 	int fd, pathlen, i;
761 
762 	pathlen = snprintf(path, sizeof(path), "%s/.gitignore", dir);
763 	if (pathlen >= sizeof(path))
764 		error_exit("%s: too long path was truncated\n", path);
765 
766 	fd = open(path, O_RDONLY | O_NOFOLLOW);
767 	if (fd < 0) {
768 		if (errno != ENOENT)
769 			return perror_exit(path);
770 		return;
771 	}
772 
773 	if (fstat(fd, &st) < 0)
774 		perror_exit(path);
775 
776 	size = st.st_size;
777 
778 	buf = xmalloc(size + 1);
779 	if (read(fd, buf, st.st_size) != st.st_size)
780 		perror_exit(path);
781 
782 	buf[st.st_size] = '\n';
783 	if (close(fd))
784 		perror_exit(path);
785 
786 	debug("Parse %s\n", path);
787 
788 	entry = buf;
789 
790 	// skip utf8 bom
791 	if (!strncmp(entry, utf8_bom, strlen(utf8_bom)))
792 		entry += strlen(utf8_bom);
793 
794 	for (i = entry - buf; i < size; i++) {
795 		if (buf[i] == '\n') {
796 			if (entry != buf + i && entry[0] != '#') {
797 				buf[i - (i && buf[i-1] == '\r')] = 0;
798 				trim_trailing_spaces(entry);
799 				add_pattern(entry, dir, dirlen);
800 			}
801 			entry = buf + i + 1;
802 		}
803 	}
804 
805 	free(buf);
806 }
807 
808 // Save the current number of patterns and increment the depth
increment_depth(void)809 static void increment_depth(void)
810 {
811 	if (depth >= max_depth) {
812 		max_depth += 1;
813 		nr_patterns_at = xrealloc(nr_patterns_at,
814 					  sizeof(*nr_patterns_at) * max_depth);
815 	}
816 
817 	nr_patterns_at[depth] = nr_patterns;
818 	depth++;
819 }
820 
821 // Decrement the depth, and free up the patterns of this directory level.
decrement_depth(void)822 static void decrement_depth(void)
823 {
824 	depth--;
825 	assert(depth >= 0);
826 
827 	while (nr_patterns > nr_patterns_at[depth])
828 		free(pattern_list[--nr_patterns]);
829 }
830 
print_path(const char * path)831 static void print_path(const char *path)
832 {
833 	// The path always starts with "./"
834 	assert(strlen(path) >= 2);
835 
836 	// Replace the root directory with a preferred prefix.
837 	// This is useful for the tar command.
838 	fprintf(out_fp, "%s%s\n", prefix, path + 2);
839 }
840 
print_stat(const char * path,struct stat * st)841 static void print_stat(const char *path, struct stat *st)
842 {
843 	if (!stat_fp)
844 		return;
845 
846 	if (!S_ISREG(st->st_mode) && !S_ISLNK(st->st_mode))
847 		return;
848 
849 	assert(strlen(path) >= 2);
850 
851 	fprintf(stat_fp, "%c %9ld %10ld %s\n",
852 		S_ISLNK(st->st_mode) ? 'l' : '-',
853 		st->st_size, st->st_mtim.tv_sec, path + 2);
854 }
855 
856 // Traverse the entire directory tree, parsing .gitignore files.
857 // Print file paths that are not tracked by git.
858 //
859 // Return true if all files under the directory are ignored, false otherwise.
traverse_directory(const char * dir,int dirlen)860 static bool traverse_directory(const char *dir, int dirlen)
861 {
862 	bool all_ignored = true;
863 	DIR *dirp;
864 
865 	debug("Enter[%d]: %s\n", depth, dir);
866 	increment_depth();
867 
868 	add_patterns_from_gitignore(dir, dirlen);
869 
870 	dirp = opendir(dir);
871 	if (!dirp)
872 		perror_exit(dir);
873 
874 	while (1) {
875 		struct dirent *d;
876 		struct stat st;
877 		char path[PATH_MAX];
878 		int pathlen;
879 		bool ignored;
880 
881 		errno = 0;
882 		d = readdir(dirp);
883 		if (!d) {
884 			if (errno)
885 				perror_exit(dir);
886 			break;
887 		}
888 
889 		if (!strcmp(d->d_name, "..") || !strcmp(d->d_name, "."))
890 			continue;
891 
892 		pathlen = snprintf(path, sizeof(path), "%s/%s", dir, d->d_name);
893 		if (pathlen >= sizeof(path))
894 			error_exit("%s: too long path was truncated\n", path);
895 
896 		if (lstat(path, &st) < 0)
897 			perror_exit(path);
898 
899 		if ((!S_ISREG(st.st_mode) && !S_ISDIR(st.st_mode) && !S_ISLNK(st.st_mode)) ||
900 		    is_ignored(path, pathlen, dirlen, S_ISDIR(st.st_mode))) {
901 			ignored = true;
902 		} else {
903 			if (S_ISDIR(st.st_mode) && !S_ISLNK(st.st_mode))
904 				// If all the files in a directory are ignored,
905 				// let's ignore that directory as well. This
906 				// will avoid empty directories in the tarball.
907 				ignored = traverse_directory(path, pathlen);
908 			else
909 				ignored = false;
910 		}
911 
912 		if (ignored) {
913 			print_path(path);
914 		} else {
915 			print_stat(path, &st);
916 			all_ignored = false;
917 		}
918 	}
919 
920 	if (closedir(dirp))
921 		perror_exit(dir);
922 
923 	decrement_depth();
924 	debug("Leave[%d]: %s\n", depth, dir);
925 
926 	return all_ignored;
927 }
928 
usage(void)929 static void usage(void)
930 {
931 	fprintf(stderr,
932 		"usage: %s [options]\n"
933 		"\n"
934 		"Show files that are ignored by git\n"
935 		"\n"
936 		"options:\n"
937 		"  -d, --debug                  print debug messages to stderr\n"
938 		"  -e, --exclude PATTERN        add the given exclude pattern\n"
939 		"  -h, --help                   show this help message and exit\n"
940 		"  -i, --ignore-case            Ignore case differences between the patterns and the files\n"
941 		"  -o, --output FILE            output the ignored files to a file (default: '-', i.e. stdout)\n"
942 		"  -p, --prefix PREFIX          prefix added to each path (default: empty string)\n"
943 		"  -r, --rootdir DIR            root of the source tree (default: current working directory)\n"
944 		"  -s, --stat FILE              output the file stat of non-ignored files to a file\n",
945 		progname);
946 }
947 
/*
 * Open an output stream and store it in *fp.
 * The name "-" selects stdout; anything else is opened for writing
 * (truncating an existing file). Exits the program if the file cannot be
 * opened.
 */
static void open_output(const char *pathname, FILE **fp)
{
	if (strcmp(pathname, "-") == 0) {
		*fp = stdout;
		return;
	}

	*fp = fopen(pathname, "w");
	if (*fp == NULL)
		perror_exit(pathname);
}
958 
/*
 * Flush and close an output stream, terminating the program if any write
 * to it failed or if closing it fails. "pathname" is only used in the
 * error message.
 */
static void close_output(const char *pathname, FILE *fp)
{
	(void)fflush(fp);

	/* The error indicator covers the flush above and all earlier writes. */
	if (ferror(fp) != 0)
		error_exit("not all data was written to the output\n");

	if (fclose(fp) != 0)
		perror_exit(pathname);
}
969 
main(int argc,char * argv[])970 int main(int argc, char *argv[])
971 {
972 	const char *output = "-";
973 	const char *rootdir = ".";
974 	const char *stat = NULL;
975 
976 	progname = strrchr(argv[0], '/');
977 	if (progname)
978 		progname++;
979 	else
980 		progname = argv[0];
981 
982 	while (1) {
983 		static struct option long_options[] = {
984 			{"debug",       no_argument,       NULL, 'd'},
985 			{"help",        no_argument,       NULL, 'h'},
986 			{"ignore-case", no_argument,       NULL, 'i'},
987 			{"output",      required_argument, NULL, 'o'},
988 			{"prefix",      required_argument, NULL, 'p'},
989 			{"rootdir",     required_argument, NULL, 'r'},
990 			{"stat",        required_argument, NULL, 's'},
991 			{"exclude",     required_argument, NULL, 'x'},
992 			{},
993 		};
994 
995 		int c = getopt_long(argc, argv, "dhino:p:r:s:x:", long_options, NULL);
996 
997 		if (c == -1)
998 			break;
999 
1000 		switch (c) {
1001 		case 'd':
1002 			debug_on = true;
1003 			break;
1004 		case 'h':
1005 			usage();
1006 			exit(0);
1007 		case 'i':
1008 			ignore_case = true;
1009 			break;
1010 		case 'o':
1011 			output = optarg;
1012 			break;
1013 		case 'p':
1014 			prefix = optarg;
1015 			break;
1016 		case 'r':
1017 			rootdir = optarg;
1018 			break;
1019 		case 's':
1020 			stat = optarg;
1021 			break;
1022 		case 'x':
1023 			add_pattern(optarg, ".", strlen("."));
1024 			break;
1025 		case '?':
1026 			usage();
1027 			/* fallthrough */
1028 		default:
1029 			exit(EXIT_FAILURE);
1030 		}
1031 	}
1032 
1033 	open_output(output, &out_fp);
1034 	if (stat && stat[0])
1035 		open_output(stat, &stat_fp);
1036 
1037 	if (chdir(rootdir))
1038 		perror_exit(rootdir);
1039 
1040 	add_pattern(".git/", ".", strlen("."));
1041 
1042 	if (traverse_directory(".", strlen(".")))
1043 		print_path("./");
1044 
1045 	assert(depth == 0);
1046 
1047 	while (nr_patterns > 0)
1048 		free(pattern_list[--nr_patterns]);
1049 	free(pattern_list);
1050 	free(nr_patterns_at);
1051 
1052 	close_output(output, out_fp);
1053 	if (stat_fp)
1054 		close_output(stat, stat_fp);
1055 
1056 	return 0;
1057 }
1058