Files
openide/native/WslTools/wslhash.c
Maxat Mansurov 29da655cee RUBY-31136 add option to WslSync for creating stubs for filtered out files.
(cherry picked from commit fdadb26083074b6e510df5c8b66bb3ee13bdb8c3)

IJ-CR-105378

GitOrigin-RevId: b5d0d16b1bba3802191775d9400b23205242238f
2023-03-27 21:47:10 +00:00

360 lines
11 KiB
C

#define XXH_VECTOR XXH_SSE2
#define XXH_STATIC_LINKING_ONLY 1
#include <stdio.h>
#include <limits.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "xxhash.h"
#include <ftw.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <errno.h>
#include <locale.h>
#include <langinfo.h>
#include <stdbool.h>
#include <stdarg.h>
#include <regex.h>
// Usage:
// wslhash [OPTIONS] DIR
//
// Options:
// -n
// skip the hash calculation step.
// -f FILTER
// filters the files using the given FILTER. May be specified multiple times.
// -s
// report files for stubbing, i.e. files that exist but were filtered out (explicitly or implicitly).
//
// Description:
// Calculate hashes (unless `-n`) for all files in the given DIR.
// Files can be filtered using `-f` option.
//
// Filters:
// Each filter must be specified in the following format:
// OPERATOR:MATCHER:PATTERN
// where OPERATOR is one of:
// `-` to exclude
// `+` to include
// and MATCHER is one of:
// `rgx` for matching using extended regular expressions in PATTERN (see regex(3))
//
// The combination of OPERATORS dictates the filtering behavior:
// only `-`, means process all files that do not match any excludes
// only `+`, means process only files that match any includes
// both `+` and `-`, means process all files that either do not match any excludes or match any includes
//
// Filters within each OPERATOR group are processed in the order of appearance in the command line.
//
// Output format:
// [FILE_PATH]\0[HASH]
// where HASH is little-endian 8 byte (64 bit) integer
// [LINK_PATH]\1[LINK_LEN][LINK]
// where LINK_LEN is 4 byte (32 bit) signed int
// [STUB_PATH]\2
// Uncomment the define below to enable tracing to stderr.
//#define WSLHASH_DEBUG 1
#ifdef WSLHASH_DEBUG
// Prints a trace message to stderr, prefixed with the source file, line and function name.
// At least one argument must follow `fmt`, because __VA_ARGS__ is expanded right after a comma.
#define DEBUG_PRINTF(fmt, ...) \
do { fprintf(stderr, "%s:%d:%s: " fmt, __FILE__, __LINE__, __func__, __VA_ARGS__); } while (0)
#else
// Tracing is disabled: the macro expands to nothing.
#define DEBUG_PRINTF(fmt, ...)
#endif // WSLHASH_DEBUG
// Two-level stringification: the outer macro expands its argument before the inner one turns it into a string literal.
#define STRINGIFY_(a) #a
#define STRINGIFY(a) STRINGIFY_(a)
// Capacity of each of the two filter arrays (excludes and includes).
#define FLT_N_MAX 50
// Maximum lengths, without the terminating NUL, of the MATCHER and PATTERN parts of a filter.
#define FLT_MATCHER_LEN_MAX 3
#define FLT_PATTERN_LEN_MAX 64
#define FLT_NAME_LEN_MAX (1 + FLT_MATCHER_LEN_MAX + FLT_PATTERN_LEN_MAX + 2) // OPERATOR + MATCHER + PATTERN + delims
// scanf format for OPERATOR:MATCHER:PATTERN; the field widths are taken from the limits above.
#define FLT_SCAN_FMT "%c:%" STRINGIFY(FLT_MATCHER_LEN_MAX) "s:%" STRINGIFY(FLT_PATTERN_LEN_MAX) "s"
// Separator bytes printed right after a path; they tell apart file, link and stub records in the output.
#define FILE_SEPARATOR 0
#define LINK_SEPARATOR 1
#define STUB_SEPARATOR 2
// A single filter created from one `-f` option.
// The matcher-specific behavior is supplied through the function pointers, which receive the filter itself.
struct wslhash_filter_t {
char name[FLT_NAME_LEN_MAX + 1]; // full filter name (OPERATOR:MATCHER:PATTERN).
// For the `rgx` matcher this is the heap-allocated, compiled regex_t.
void *pattern; // points to arbitrary pattern object.
int (*fn_match)(const struct wslhash_filter_t *, const char *); // returns 1 if matches, 0 otherwise.
// The second argument is the raw PATTERN text of the filter.
void (*fn_init)(struct wslhash_filter_t *, const char *); // initializes the filter.
void (*fn_free)(const struct wslhash_filter_t *); // destroys the filter.
};
// Everything parsed from the command line.
struct wslhash_options_t {
// Resolved (realpath) directory to walk, and the cached strlen of it.
char root_dir[PATH_MAX];
size_t root_dir_len;
// Filters given with the `-` operator; only the first excludes_len entries are in use.
struct wslhash_filter_t excludes[FLT_N_MAX];
size_t excludes_len;
// Filters given with the `+` operator; only the first includes_len entries are in use.
struct wslhash_filter_t includes[FLT_N_MAX];
size_t includes_len;
// Non-zero when `-n` was given: hashes are not calculated.
int skip_hash;
// Non-zero when `-s` was given: filtered out files are reported as stubs.
int report_stubs;
};
// All-zero placeholder written instead of a hash for empty files or when hashing is skipped.
static const char EMPTY[sizeof(XXH64_hash_t)] = {0};
// Options of the current run; filled in while parsing the command line and read while walking the files.
static struct wslhash_options_t g_options = {0};
// Destroys every registered filter: all excludes first, then all includes.
static void free_all(void) {
    size_t idx = 0;
    while (idx < g_options.excludes_len) {
        const struct wslhash_filter_t *current = &g_options.excludes[idx];
        current->fn_free(current);
        idx++;
    }

    idx = 0;
    while (idx < g_options.includes_len) {
        const struct wslhash_filter_t *current = &g_options.includes[idx];
        current->fn_free(current);
        idx++;
    }
}
// Checks `filename` against the first `filter_len` entries of `filters`, in order.
// Returns true as soon as one of them matches, false when none does (or when there are no filters).
static int any_match(const struct wslhash_filter_t *filters, const size_t filter_len, const char *filename) {
    size_t idx = 0;
    while (idx < filter_len) {
        const struct wslhash_filter_t *candidate = &filters[idx++];
        if (!candidate->fn_match(candidate, filename)) {
            continue;
        }
        DEBUG_PRINTF("File matched a filter '%s': %s\n", candidate->name, filename);
        return true;
    }
    return false;
}
// Decides whether a file with the given name has to be processed:
//   no filters at all  -> every file is accepted;
//   only includes      -> accepted if any include matches;
//   only excludes      -> accepted unless any exclude matches;
//   both               -> accepted unless excluded, an excluded file is still accepted if any include matches.
// Excludes are always consulted before includes.
static int is_filename_ok(const char *filename) {
    DEBUG_PRINTF("Checking file: %s\n", filename);

    const bool has_excludes = g_options.excludes_len != 0;
    const bool has_includes = g_options.includes_len != 0;

    if (!has_excludes) {
        return has_includes ? any_match(g_options.includes, g_options.includes_len, filename) : true;
    }
    if (!any_match(g_options.excludes, g_options.excludes_len, filename)) {
        return true;
    }
    // The file is excluded; only an include can bring it back.
    return has_includes ? any_match(g_options.includes, g_options.includes_len, filename) : false;
}
// Tells whether `path` refers to a directory (symbolic links are followed by stat).
// Returns false when the path can't be stat'ed.
static int is_dir(const char *path) {
    struct stat info = {0};
    const int stat_rc = stat(path, &info);
    return (stat_rc != 0) ? false : S_ISDIR(info.st_mode);
}
// Returns the part of `fpath` that follows the last '/', or `fpath` itself when it has no '/'.
// The result points into `fpath`; nothing is allocated.
static const char *filename(const char *fpath) {
    const char *base = fpath;
    for (const char *cursor = fpath; *cursor != '\0'; ++cursor) {
        if (*cursor == '/') {
            base = cursor + 1;
        }
    }
    return base;
}
// nftw(3) callback, called on each entry found under the root directory.
//
// Entries reported as FTW_F whose name (last path component) passes the filters are written as
//   [relative path][FILE_SEPARATOR][8 byte hash]
// where the hash is all zeroes for empty files or when hashing is skipped.
// FTW_F entries rejected by the filters are written as [relative path][STUB_SEPARATOR], but only if stubs
// were requested; otherwise nothing is written for them.
// Symbolic links (FTW_SL) that resolve to a directory are written as
//   [relative path][LINK_SEPARATOR][int32 length][resolved path]
// Every other entry is ignored.
//
// Always returns 0, so the walk is never interrupted; a file that can't be opened or mapped terminates
// the process with exit code 2 or 3 respectively.
static int
process_file(const char *fpath, const struct stat *sb, int tflag, __attribute__((unused)) struct FTW *ftwbuf) {
    DEBUG_PRINTF("Processing file: %s\n", fpath);
    if (tflag != FTW_F && tflag != FTW_SL) {
        DEBUG_PRINTF("Skipping: %s\n", fpath);
        return 0; // Not a file
    }

    // Drop the root prefix and the "/" that follows it. When the root is "/" itself, the separator is already
    // a part of the prefix, so there is nothing more to skip.
    const char *fpath_relative = fpath + g_options.root_dir_len;
    if (*fpath_relative == '/') {
        fpath_relative++;
    }

    if (tflag == FTW_F) {
        if (!is_filename_ok(filename(fpath))) {
            DEBUG_PRINTF("Excluding file: %s\n", fpath);
            if (g_options.report_stubs) {
                printf("%s%c", fpath_relative, STUB_SEPARATOR);
            }
            return 0;
        }
        printf("%s%c", fpath_relative, FILE_SEPARATOR);
        if (sb->st_size == 0 || g_options.skip_hash) {
            // No need to calculate hash for empty file
            fwrite(EMPTY, sizeof(EMPTY), 1, stdout);
            return 0;
        }
        const int fd = open(fpath, O_RDONLY);
        if (fd == -1) {
            const int open_errno = errno; // fprintf may overwrite errno before perror reads it
            fprintf(stderr, "Can't open file %s", fpath);
            errno = open_errno;
            perror("Can't open file");
            exit(2);
        }
        // Mmap file and calculate hash
        char *buffer = mmap(NULL, sb->st_size, PROT_READ, MAP_FILE | MAP_PRIVATE, fd, 0);
        if (buffer == MAP_FAILED) {
            const int mmap_errno = errno; // fprintf may overwrite errno before perror reads it
            fprintf(stderr, "Can't mmap file %s", fpath);
            errno = mmap_errno;
            perror("Can't mmap file");
            exit(3);
        }
        // Only a hint, and only meaningful for a valid mapping, hence after the MAP_FAILED check.
        madvise(buffer, sb->st_size, MADV_SEQUENTIAL);
        XXH64_hash_t hash = XXH64(buffer, sb->st_size, 0);
        fwrite(&hash, sizeof(XXH64_hash_t), 1, stdout);
        munmap(buffer, sb->st_size);
        close(fd);
    } else {
        // Symbolic link: reported only if it can be resolved and points to a directory.
        char real_path[PATH_MAX] = {0};
        if (realpath(fpath, real_path) != NULL && is_dir(real_path)) {
            printf("%s%c", fpath_relative, LINK_SEPARATOR);
            const int32_t len = (int32_t) strlen(real_path);
            fwrite(&len, sizeof(int32_t), 1, stdout);
            fputs(real_path, stdout);
        }
    }
    return 0;
}
// Initializer of the `rgx` matcher: compiles `pattern_raw` as an extended regular expression and stores
// the heap-allocated result in self->pattern (released later by the filter's fn_free).
// Terminates the process when the allocation fails or the pattern doesn't compile.
static void rgx_init(struct wslhash_filter_t *self, const char *pattern_raw) {
    regex_t *regex = calloc(1, sizeof *regex);
    if (!regex) {
        fprintf(stderr, "Calloc failed\n");
        exit(EXIT_FAILURE);
    }
    const int rc = regcomp(regex, pattern_raw, REG_EXTENDED);
    if (rc != 0) {
        // Tell the user why the pattern was rejected instead of only echoing it back.
        char reason[100] = {0};
        regerror(rc, regex, reason, sizeof(reason));
        fprintf(stderr, "Failed to compile extended regex '%s': %s\n", pattern_raw, reason);
        free(regex);
        exit(EXIT_FAILURE);
    }
    self->pattern = regex;
}
// Destructor of the `rgx` matcher: releases the compiled regex and then the memory that holds it.
static void rgx_free(const struct wslhash_filter_t *self) {
    regfree(self->pattern);
    free(self->pattern);
}
// Match function of the `rgx` matcher.
// Returns true when `path` matches the compiled regex of the filter, false when it doesn't.
// Any other regexec result is reported to stderr and terminates the process.
static int rgx_match(const struct wslhash_filter_t *self, const char *path) {
    const regex_t *regex = self->pattern;
    const int result = regexec(regex, path, 0, NULL, 0);
    // POSIX defines the success value of regexec as 0; REG_OK is a non-standard name that not every libc provides.
    if (result == 0) {
        return true;
    }
    if (result == REG_NOMATCH) {
        return false;
    }
    char buf[100] = {0};
    regerror(result, regex, buf, sizeof(buf));
    fprintf(stderr, "Regex match failed: %s\n", buf);
    exit(EXIT_FAILURE);
}
static void parse_filter(const char *arg) {
char operator;
char matcher[FLT_MATCHER_LEN_MAX + 1] = {0};
char pattern_raw[FLT_PATTERN_LEN_MAX + 1] = {0};
if (sscanf(arg, FLT_SCAN_FMT, &operator, matcher, pattern_raw) < 3) {
fprintf(stderr, "Invalid filter format: %s\n", arg);
exit(EXIT_FAILURE);
}
struct wslhash_filter_t *filter;
if (operator == '-') {
if (g_options.excludes_len >= FLT_N_MAX) {
fprintf(stderr, "Too many exclude filters >%d\n", FLT_N_MAX);
exit(EXIT_FAILURE);
}
filter = &g_options.excludes[g_options.excludes_len++];
} else if (operator == '+') {
if (g_options.includes_len >= FLT_N_MAX) {
fprintf(stderr, "Too many include filters >%d\n", FLT_N_MAX);
exit(EXIT_FAILURE);
}
filter = &g_options.includes[g_options.includes_len++];
} else {
fprintf(stderr, "Unknown filter operator '%c': %s\n", operator, arg);
exit(EXIT_FAILURE);
}
if (strcmp(matcher, "rgx") == 0) {
filter->fn_init = rgx_init;
filter->fn_free = rgx_free;
filter->fn_match = rgx_match;
} else {
fprintf(stderr, "Unknown filter matcher '%s': %s\n", matcher, arg);
exit(EXIT_FAILURE);
}
strncpy(filter->name, arg, FLT_NAME_LEN_MAX);
filter->fn_init(filter, pattern_raw);
}
static void parse_args(int argc, char *argv[]) {
int c;
while ((c = getopt(argc, argv, "nsf:")) != -1) {
switch (c) {
case 's':
g_options.report_stubs = 1;
break;
case 'n':
g_options.skip_hash = 1;
break;
case 'f':
parse_filter(optarg);
break;
default:
fprintf(stderr, "Invalid options\n");
exit(EXIT_FAILURE);
}
}
if (optind >= argc) {
fprintf(stderr, "Dir is missing\n");
exit(EXIT_FAILURE);
}
const char *dir = argv[optind];
if (!is_dir(dir)) {
fprintf(stderr, "Provided path is not root_dir\n");
exit(2);
}
if (realpath(dir, g_options.root_dir) == NULL) {
fprintf(stderr, "realpath failed: %d", errno);
exit(-1);
}
g_options.root_dir_len = strlen(g_options.root_dir);
}
// Switches LC_CTYPE to the locale of the environment and checks its codeset.
// Returns true for the accepted codesets, otherwise complains to stderr and returns false.
static int ensure_charset(void) {
    // UTF-8: Java side decodes output as UTF-8 and almost all WSL distros use UTF.
    // ASCII: it is 7 bit, so english texts could be decoded by java either.
    // Other charsets aren't used nor supported by WSL.
    static const char *const accepted[] = {"UTF-8", "ASCII"};

    setlocale(LC_CTYPE, "");
    const char *codeset = nl_langinfo(CODESET);

    for (size_t i = 0; i < sizeof(accepted) / sizeof(accepted[0]); i++) {
        if (strcmp(codeset, accepted[i]) == 0) {
            return true;
        }
    }

    fprintf(stderr, "Please use UTF-8 locale, not %s", codeset);
    return false;
}
// Entry point: checks the locale, parses the command line, walks the root directory and reports every entry.
// Returns -1 on unsupported charset, 3 when the walk fails, EXIT_SUCCESS otherwise.
int main(int argc, char *argv[]) {
    if (!ensure_charset()) {
        return -1;
    }
    parse_args(argc, argv);

    // number of file descriptors is more or less random taken from example
    // we don't know how many descriptors are available on the particular WSL, but sure not less than 20
    const int max_open_fds = 20;
    const int walk_flags = FTW_MOUNT | FTW_PHYS;

    // Walk through files, see nftw(3)
    const int walk_rc = nftw(g_options.root_dir, process_file, max_open_fds, walk_flags);
    if (walk_rc == -1) {
        perror("nftw failed");
        return 3;
    }

    free_all();
    return EXIT_SUCCESS;
}