mirror of
https://gitflic.ru/project/openide/openide.git
synced 2025-12-15 11:53:49 +07:00
(cherry picked from commit fdadb26083074b6e510df5c8b66bb3ee13bdb8c3) IJ-CR-105378 GitOrigin-RevId: b5d0d16b1bba3802191775d9400b23205242238f
360 lines
11 KiB
C
360 lines
11 KiB
C
#define XXH_VECTOR XXH_SSE2
|
|
#define XXH_STATIC_LINKING_ONLY 1
|
|
|
|
#include <stdio.h>
|
|
#include <limits.h>
|
|
#include <stdlib.h>
|
|
#include <string.h>
|
|
#include <unistd.h>
|
|
#include "xxhash.h"
|
|
#include <ftw.h>
|
|
#include <fcntl.h>
|
|
#include <sys/mman.h>
|
|
#include <errno.h>
|
|
#include <locale.h>
|
|
#include <langinfo.h>
|
|
#include <stdbool.h>
|
|
#include <stdarg.h>
|
|
#include <regex.h>
|
|
|
|
// Usage:
|
|
// wslhash [OPTIONS] DIR
|
|
//
|
|
// Options:
|
|
// -n
|
|
// skip the hash calculation step.
|
|
// -f FILTER
|
|
// filters the files using the given FILTER. May be specified multiple times.
|
|
// -s
|
|
// report files for stubbing, i.e. files that exist but were filtered out (explicitly or implicitly).
|
|
//
|
|
// Description:
|
|
// Calculate hashes (unless `-n`) for all files in the given DIR.
|
|
// Files can be filtered using `-f` option.
|
|
//
|
|
// Filters:
|
|
// Each filter must be specified in the following format:
|
|
// OPERATOR:MATCHER:PATTERN
|
|
// where OPERATOR is one of:
|
|
// `-` to exclude
|
|
// `+` to include
|
|
// and MATCHER is one of:
|
|
// `rgx` for matching using extended regular expressions in PATTERN (see regex(3))
|
|
//
|
|
// The combination of OPERATORS dictates the filtering behavior:
|
|
// only `-`, means process all files that do not match any excludes
|
|
// only `+`, means process only files that match any includes
|
|
// both `+` and `-`, mean process all files that do not match any excludes or match any includes
|
|
//
|
|
// Filters within each OPERATOR group are processed in the order of appearance in the command line.
|
|
//
|
|
// Output format:
|
|
// [FILE_PATH]\0[HASH]
|
|
// where HASH is little-endian 8 byte (64 bit) integer
|
|
// [LINK_PATH]\1[LINK_LEN][LINK]
|
|
// where LINK_LEN is 4 byte (32 bit) signed int
|
|
// [STUB_PATH]\2
|
|
|
|
// Uncomment the next line to get trace output on stderr.
//#define WSLHASH_DEBUG 1
#if defined(WSLHASH_DEBUG)
// Prints "<file>:<line>:<function>: " followed by the formatted message to stderr.
// At least one argument after `fmt` is required.
#define DEBUG_PRINTF(fmt, ...)                                                        \
    do {                                                                              \
        fprintf(stderr, "%s:%d:%s: " fmt, __FILE__, __LINE__, __func__, __VA_ARGS__); \
    } while (0)
#else
// Tracing disabled: the macro expands to nothing.
#define DEBUG_PRINTF(fmt, ...)
#endif // WSLHASH_DEBUG
|
|
|
|
// Two-step stringification: STRINGIFY(X) expands the macro X first and then
// turns the result into a string literal.
#define STRINGIFY_(a) #a
#define STRINGIFY(a) STRINGIFY_(a)

// Maximum number of filters per OPERATOR group (includes and excludes are counted separately).
#define FLT_N_MAX 50
// Maximum length of the MATCHER part of a filter.
#define FLT_MATCHER_LEN_MAX 3
// Maximum length of the PATTERN part of a filter.
#define FLT_PATTERN_LEN_MAX 64
// Maximum length of a whole filter: OPERATOR + MATCHER + PATTERN + delims
#define FLT_NAME_LEN_MAX (1 + FLT_MATCHER_LEN_MAX + FLT_PATTERN_LEN_MAX + 2)
// sscanf format for "OPERATOR:MATCHER:PATTERN"; the field widths are derived from the limits above.
#define FLT_SCAN_FMT "%c:%" STRINGIFY(FLT_MATCHER_LEN_MAX) "s:%" STRINGIFY(FLT_PATTERN_LEN_MAX) "s"

// Byte written right after a path in the output; it tells which kind of record follows.
#define FILE_SEPARATOR 0
#define LINK_SEPARATOR 1
#define STUB_SEPARATOR 2
|
|
|
|
struct wslhash_filter_t {
|
|
char name[FLT_NAME_LEN_MAX + 1]; // full filter name (OPERATOR:MATCHER:PATTERN).
|
|
|
|
void *pattern; // points to arbitrary pattern object.
|
|
|
|
int (*fn_match)(const struct wslhash_filter_t *, const char *); // returns 1 if matches, 0 otherwise.
|
|
|
|
void (*fn_init)(struct wslhash_filter_t *, const char *); // initializes the filter.
|
|
|
|
void (*fn_free)(const struct wslhash_filter_t *); // destroys the filter.
|
|
};
|
|
|
|
struct wslhash_options_t {
|
|
char root_dir[PATH_MAX];
|
|
size_t root_dir_len;
|
|
|
|
struct wslhash_filter_t excludes[FLT_N_MAX];
|
|
size_t excludes_len;
|
|
|
|
struct wslhash_filter_t includes[FLT_N_MAX];
|
|
size_t includes_len;
|
|
|
|
int skip_hash;
|
|
int report_stubs;
|
|
};
|
|
|
|
|
|
static const char EMPTY[sizeof(XXH64_hash_t)] = {0};
|
|
|
|
static struct wslhash_options_t g_options = {0};
|
|
|
|
|
|
static void free_all(void) {
|
|
const struct wslhash_filter_t *filter;
|
|
for (size_t i = 0; i < g_options.excludes_len; i++) {
|
|
filter = &g_options.excludes[i];
|
|
filter->fn_free(filter);
|
|
}
|
|
for (size_t i = 0; i < g_options.includes_len; i++) {
|
|
filter = &g_options.includes[i];
|
|
filter->fn_free(filter);
|
|
}
|
|
}
|
|
|
|
static int any_match(const struct wslhash_filter_t *filters, const size_t filter_len, const char *filename) {
|
|
const struct wslhash_filter_t *filter;
|
|
for (size_t i = 0; i < filter_len; i++) {
|
|
filter = &filters[i];
|
|
if (filter->fn_match(filter, filename)) {
|
|
DEBUG_PRINTF("File matched a filter '%s': %s\n", filter->name, filename);
|
|
return true;
|
|
}
|
|
}
|
|
return false;
|
|
}
|
|
|
|
static int is_filename_ok(const char *filename) {
|
|
DEBUG_PRINTF("Checking file: %s\n", filename);
|
|
if (g_options.excludes_len == 0 && g_options.includes_len == 0) {
|
|
return true;
|
|
}
|
|
if (g_options.excludes_len == 0) {
|
|
return any_match(g_options.includes, g_options.includes_len, filename);
|
|
}
|
|
if (g_options.includes_len == 0) {
|
|
return !any_match(g_options.excludes, g_options.excludes_len, filename);
|
|
}
|
|
return !any_match(g_options.excludes, g_options.excludes_len, filename) ||
|
|
any_match(g_options.includes, g_options.includes_len, filename);
|
|
}
|
|
|
|
// Returns non-zero when `path` can be stat()-ed and refers to a directory
// (symlinks are followed by stat), zero otherwise.
static int is_dir(const char *path) {
    struct stat info = {0};
    const int rc = stat(path, &info);
    return (rc == 0) ? S_ISDIR(info.st_mode) : false;
}
|
|
|
|
// Returns a pointer to the part of `fpath` that follows the last '/',
// or `fpath` itself when it contains no '/'. The result points into `fpath`.
static const char *filename(const char *fpath) {
    const char *base = fpath;
    for (const char *cursor = fpath; *cursor != '\0'; cursor++) {
        if (*cursor == '/') {
            base = cursor + 1;
        }
    }
    return base;
}
|
|
|
|
// Called on each file
|
|
static int
|
|
process_file(const char *fpath, const struct stat *sb, int tflag, __attribute__((unused)) struct FTW *ftwbuf) {
|
|
DEBUG_PRINTF("Processing file: %s\n", fpath);
|
|
if (tflag != FTW_F && tflag != FTW_SL) {
|
|
DEBUG_PRINTF("Skipping: %s\n", fpath);
|
|
return 0; // Not a file
|
|
}
|
|
const char *fpath_relative = fpath + g_options.root_dir_len + 1; // remove first "/"
|
|
if (tflag == FTW_F) {
|
|
if (!is_filename_ok(filename(fpath))) {
|
|
DEBUG_PRINTF("Excluding file: %s\n", fpath);
|
|
if (g_options.report_stubs) {
|
|
printf("%s%c", fpath_relative, STUB_SEPARATOR);
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
printf("%s%c", fpath_relative, FILE_SEPARATOR);
|
|
if (sb->st_size == 0 || g_options.skip_hash) {
|
|
// No need to calculate hash for empty file
|
|
fwrite(EMPTY, sizeof(EMPTY), 1, stdout);
|
|
return 0;
|
|
}
|
|
const int fd = open(fpath, O_RDONLY);
|
|
if (fd == -1) {
|
|
fprintf(stderr, "Can't open file %s", fpath);
|
|
perror("Can't open file");
|
|
exit(2);
|
|
}
|
|
|
|
// Mmap file and calculate hash
|
|
char *buffer = mmap(NULL, sb->st_size, PROT_READ, MAP_FILE | MAP_PRIVATE, fd, 0);
|
|
madvise(buffer, sb->st_size, MADV_SEQUENTIAL);
|
|
if (buffer == MAP_FAILED) {
|
|
fprintf(stderr, "Can't mmap file %s", fpath);
|
|
perror("Can't mmap file");
|
|
exit(3);
|
|
}
|
|
XXH64_hash_t hash = XXH64(buffer, sb->st_size, 0);
|
|
fwrite(&hash, sizeof(XXH64_hash_t), 1, stdout);
|
|
munmap(buffer, sb->st_size);
|
|
|
|
close(fd);
|
|
} else {
|
|
char real_path[PATH_MAX] = {0};
|
|
if (realpath(fpath, real_path) != NULL && is_dir(real_path)) {
|
|
printf("%s%c", fpath_relative, LINK_SEPARATOR);
|
|
const int32_t len = (int32_t) strlen(real_path);
|
|
fwrite(&len, sizeof(int32_t), 1, stdout);
|
|
fputs(real_path, stdout);
|
|
}
|
|
}
|
|
return 0;
|
|
}
|
|
|
|
static void rgx_init(struct wslhash_filter_t *self, const char *pattern_raw) {
|
|
regex_t *regex = calloc(1, sizeof(regex_t));
|
|
if (!regex) {
|
|
fprintf(stderr, "Calloc failed\n");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
if (regcomp(regex, pattern_raw, REG_EXTENDED)) {
|
|
fprintf(stderr, "Failed to compile basic regex: %s\n", pattern_raw);
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
self->pattern = regex;
|
|
}
|
|
|
|
static void rgx_free(const struct wslhash_filter_t *self) {
|
|
regex_t *regex = self->pattern;
|
|
regfree(regex);
|
|
free(regex);
|
|
}
|
|
|
|
static int rgx_match(const struct wslhash_filter_t *self, const char *path) {
|
|
const regex_t *regex = self->pattern;
|
|
const int result = regexec(regex, path, 0, NULL, 0);
|
|
if (result == REG_OK) {
|
|
return true;
|
|
}
|
|
if (result == REG_NOMATCH) {
|
|
return false;
|
|
}
|
|
char buf[100] = {0};
|
|
regerror(result, regex, buf, sizeof(buf));
|
|
fprintf(stderr, "Regex match failed: %s\n", buf);
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
static void parse_filter(const char *arg) {
|
|
char operator;
|
|
char matcher[FLT_MATCHER_LEN_MAX + 1] = {0};
|
|
char pattern_raw[FLT_PATTERN_LEN_MAX + 1] = {0};
|
|
|
|
if (sscanf(arg, FLT_SCAN_FMT, &operator, matcher, pattern_raw) < 3) {
|
|
fprintf(stderr, "Invalid filter format: %s\n", arg);
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
struct wslhash_filter_t *filter;
|
|
|
|
if (operator == '-') {
|
|
if (g_options.excludes_len >= FLT_N_MAX) {
|
|
fprintf(stderr, "Too many exclude filters >%d\n", FLT_N_MAX);
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
filter = &g_options.excludes[g_options.excludes_len++];
|
|
} else if (operator == '+') {
|
|
if (g_options.includes_len >= FLT_N_MAX) {
|
|
fprintf(stderr, "Too many include filters >%d\n", FLT_N_MAX);
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
filter = &g_options.includes[g_options.includes_len++];
|
|
} else {
|
|
fprintf(stderr, "Unknown filter operator '%c': %s\n", operator, arg);
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
if (strcmp(matcher, "rgx") == 0) {
|
|
filter->fn_init = rgx_init;
|
|
filter->fn_free = rgx_free;
|
|
filter->fn_match = rgx_match;
|
|
} else {
|
|
fprintf(stderr, "Unknown filter matcher '%s': %s\n", matcher, arg);
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
|
|
strncpy(filter->name, arg, FLT_NAME_LEN_MAX);
|
|
filter->fn_init(filter, pattern_raw);
|
|
}
|
|
|
|
static void parse_args(int argc, char *argv[]) {
|
|
int c;
|
|
while ((c = getopt(argc, argv, "nsf:")) != -1) {
|
|
switch (c) {
|
|
case 's':
|
|
g_options.report_stubs = 1;
|
|
break;
|
|
case 'n':
|
|
g_options.skip_hash = 1;
|
|
break;
|
|
case 'f':
|
|
parse_filter(optarg);
|
|
break;
|
|
default:
|
|
fprintf(stderr, "Invalid options\n");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
}
|
|
if (optind >= argc) {
|
|
fprintf(stderr, "Dir is missing\n");
|
|
exit(EXIT_FAILURE);
|
|
}
|
|
const char *dir = argv[optind];
|
|
if (!is_dir(dir)) {
|
|
fprintf(stderr, "Provided path is not root_dir\n");
|
|
exit(2);
|
|
}
|
|
if (realpath(dir, g_options.root_dir) == NULL) {
|
|
fprintf(stderr, "realpath failed: %d", errno);
|
|
exit(-1);
|
|
}
|
|
g_options.root_dir_len = strlen(g_options.root_dir);
|
|
}
|
|
|
|
// Switches LC_CTYPE to the locale configured in the environment and checks that
// its codeset is one the consumer of the output can decode.
// Returns true for "UTF-8" and "ASCII"; otherwise prints a hint to stderr and returns false.
static int ensure_charset(void) {
    // UTF-8: Java side decodes output as UTF-8 and almost all WSL distros use UTF.
    // ASCII: 7 bit, so it decodes as UTF-8 unchanged.
    // Other charsets aren't used nor supported by WSL.
    static const char *const accepted[] = {"UTF-8", "ASCII"};

    setlocale(LC_CTYPE, "");
    const char *charset = nl_langinfo(CODESET);

    for (size_t idx = 0; idx < sizeof(accepted) / sizeof(accepted[0]); idx++) {
        if (strcmp(charset, accepted[idx]) == 0) {
            return true;
        }
    }

    fprintf(stderr, "Please use UTF-8 locale, not %s", charset);
    return false;
}
|
|
|
|
int main(int argc, char *argv[]) {
|
|
if (!ensure_charset()) {
|
|
return -1;
|
|
}
|
|
parse_args(argc, argv);
|
|
// number of file descriptors is more or less random taken from example
|
|
// we don't know how many descriptors are available on the particular WSL, but sure not less than 20
|
|
if (nftw(g_options.root_dir, process_file, 20, FTW_MOUNT | FTW_PHYS) == -1) { // Walk through files, see nftw(3)
|
|
perror("nftw failed");
|
|
return 3;
|
|
}
|
|
free_all();
|
|
return EXIT_SUCCESS;
|
|
}
|