You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

692 lines
24 KiB

  1. #!/usr/bin/perl -w
  2. #
  3. # Copyright (C) 2016 Julian Andres Klode <jak@jak-linux.org>
  4. #
  5. # Permission is hereby granted, free of charge, to any person obtaining a copy
  6. # of this software and associated documentation files (the "Software"), to deal
  7. # in the Software without restriction, including without limitation the rights
  8. # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  9. # copies of the Software, and to permit persons to whom the Software is
  10. # furnished to do so, subject to the following conditions:
  11. #
  12. # The above copyright notice and this permission notice shall be included in
  13. # all copies or substantial portions of the Software.
  14. #
  15. # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16. # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17. # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  18. # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19. # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  20. # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  21. # THE SOFTWARE.
  22. =head1 NAME
  23. triehash - Generate a perfect hash function derived from a trie.
  24. =cut
  25. use strict;
  26. use warnings;
  27. use Getopt::Long;
  28. =head1 SYNOPSIS
  29. B<triehash> [S<I<option>>] [S<I<input file>>]
  30. =head1 DESCRIPTION
  31. triehash takes a list of words from an input file and generates a function and
  32. an enumeration describing those words.
  33. =head1 INPUT FILE FORMAT
  34. The file consists of multiple lines of the form:
  35. [label ~ ] word [= value]
  36. This maps word to value, and generates an enumeration with entries of the form:
  37. label = value
  38. If I<label> is undefined, the word will be used, the minus character will be
  39. replaced by an underscore. If value is undefined it is counted upwards from
  40. the last value.
  41. There may also be one line of the format
  42. [ label ~] = value
  43. Which defines the value to be used for non-existing keys. Note that this also
  44. changes default value for other keys, as for normal entries. So if you place
  45. = 0
  46. at the beginning of the file, unknown strings map to 0, and the other strings
  47. map to values starting with 1. If label is not specified, the default is
  48. I<Unknown>.
  49. =head1 OPTIONS
  50. =over 4
  51. =item B<-C>I<.c file> B<--code>=I<.c file>
  52. Generate code in the given file.
  53. =item B<-H>I<header file> B<--header>=I<header file>
  54. Generate a header in the given file, containing a declaration of the hash
  55. function and an enumeration.
  56. =item B<--enum-name=>I<word>
  57. The name of the enumeration.
  58. =item B<--function-name=>I<word>
  59. The name of the function.
  60. =item B<--namespace=>I<name>
  61. Put the function and enum into a namespace (C++)
  62. =item B<--class=>I<name>
  63. Put the function and enum into a class (C++)
  64. =item B<--enum-class>
  65. Generate an enum class instead of an enum (C++)
  66. =item B<--counter-name=>I<name>
  67. Use I<name> for a counter that is set to the latest entry in the enumeration
  68. + 1. This can be useful for defining array sizes.
  69. =item B<--extern-c>
  70. Wrap everything into an extern "C" block. Not compatible with the C++
  71. options, as a header with namespaces, classes, or enum classes is not
  72. valid C.
  73. =item B<--multi-byte>=I<value>
  74. Generate code reading multiple bytes at once. The value is a string of digits,
  75. each digit being the base-2 exponent of a read size to enable (3 for 8 bytes, 2 for 4 bytes, 1 for 2 bytes). The default value is 320, meaning that 8, 4, and single byte
  76. reads are enabled. Specify 0 to disable multi-byte completely, or add 1 (for example 3210) if you
  77. also want to allow 2-byte reads. 2-byte reads are disabled by default because
  78. they negatively affect performance on older Intel architectures.
  79. This generates code for both multiple bytes and single byte reads, but only
  80. enables the multiple byte reads of GNU C compatible compilers, as the following
  81. extensions are used:
  82. =over 8
  83. =item Byte-aligned integers
  84. We must be able to generate integers that are aligned to a single byte using:
  85. typedef uint64_t __attribute__((aligned (1))) triehash_uu64;
  86. =item Byte-order
  87. The macros __BYTE_ORDER__ and __ORDER_LITTLE_ENDIAN__ must be defined.
  88. =back
  89. We forcefully disable multi-byte reads on platforms where the variable
  90. I<__ARM_ARCH> is defined and I<__ARM_FEATURE_UNALIGNED> is not defined,
  91. as there is a measurable overhead from emulating the unaligned reads on
  92. ARM.
  93. =item B<--language=>I<language>
  94. Generate a file in the specified language. Currently known are 'C' and 'tree',
  95. the latter generating a tree.
  96. =item B<--include=>I<header>
  97. Add the header to the include statements of the header file. The value must
  98. be surrounded by quotes or angle brackets for C code. May be specified multiple
  99. times.
  100. =back
  101. =cut
  102. my $unknown = -1;
  103. my $unknown_label = "Unknown";
  104. my $counter_start = 0;
  105. my $enum_name = "PerfectKey";
  106. my $function_name = "PerfectHash";
  107. my $enum_class = 0;
  108. my $code_name = "-";
  109. my $header_name = "-";
  110. my $code;
  111. my $header;
  112. my $ignore_case = 0;
  113. my $multi_byte = "320";
  114. my $language = 'C';
  115. my $counter_name = undef;
  116. my @includes = ();
  117. Getopt::Long::config('default',
  118. 'bundling',
  119. 'no_getopt_compat',
  120. 'no_auto_abbrev',
  121. 'permute',
  122. 'auto_help');
  123. GetOptions ("code|C=s" => \$code_name,
  124. "header|H=s" => \$header_name,
  125. "function-name=s" => \$function_name,
  126. "ignore-case" => \$ignore_case,
  127. "enum-name=s" => \$enum_name,
  128. "language|l=s" => \$language,
  129. "multi-byte=s" => \$multi_byte,
  130. "enum-class" => \$enum_class,
  131. "include=s" => \@includes,
  132. "counter-name=s" => \$counter_name)
  133. or die("Could not parse options!");
  134. # This implements a simple trie. Each node has three attributes:
  135. #
  136. # children - A hash of keys to other nodes
  137. # value - The value to be stored here
  138. # label - A named representation of the value.
  139. #
  140. # The key at each level of the trie can consist of one or more bytes, and the
  141. # trie can be normalized to a form where all keys at a level have the same
  142. # length using rebuild_tree().
  143. package Trie {
  144. sub new {
  145. my $class = shift;
  146. my $self = {};
  147. bless $self, $class;
  148. $self->{children} = {};
  149. $self->{value} = undef;
  150. $self->{label} = undef;
  151. return $self;
  152. }
  153. # Return the largest power of 2 smaller or equal to the argument
  154. sub alignpower2 {
  155. my ($self, $length) = @_;
  156. return 8 if ($length >= 8 && $multi_byte =~ /3/);
  157. return 4 if ($length >= 4 && $multi_byte =~ /2/);
  158. return 2 if ($length >= 2 && $multi_byte =~ /1/);
  159. return 1;
  160. }
  161. # Split the key into a head block and a tail
  162. sub split_key {
  163. my ($self, $key) = @_;
  164. my $length = length $key;
  165. my $split = $self->alignpower2($length);
  166. return (substr($key, 0, $split), substr($key, $split));
  167. }
  168. # Given a key, a label, and a value, insert that into the tree, possibly
  169. # replacing an existing node.
  170. sub insert {
  171. my ($self, $key, $label, $value) = @_;
  172. if (length($key) == 0) {
  173. $self->{label} = $label;
  174. $self->{value} = $value;
  175. return;
  176. }
  177. my ($child, $tail) = $self->split_key($key);
  178. $self->{children}{$child} = Trie->new if (!defined($self->{children}{$child}));
  179. $self->{children}{$child}->insert($tail, $label, $value);
  180. }
  181. # Construct a new trie that only contains words of a given length. This
  182. # is used to split up the common trie after knowing all words, so we can
  183. # switch on the expected word length first, and have the per-trie function
  184. # implement simple longest prefix matching.
  185. sub filter_depth {
  186. my ($self, $togo) = @_;
  187. my $new = Trie->new;
  188. if ($togo != 0) {
  189. my $found = 0;
  190. foreach my $key (sort keys %{$self->{children}}) {
  191. if ($togo > length($key) || defined $self->{children}{$key}->{value}) {
  192. my $child = $self->{children}{$key}->filter_depth($togo - length($key));
  193. $new->{children}{$key}= $child if defined $child;
  194. $found = 1 if defined $child;
  195. }
  196. }
  197. return undef if (!$found);
  198. } else {
  199. $new->{value} = $self->{value};
  200. $new->{label} = $self->{label};
  201. }
  202. return $new;
  203. }
  204. # (helper for rebuild_tree)
  205. # Reinsert all value nodes into the specified $trie, prepending $prefix
  206. # to their $paths.
  207. sub reinsert_value_nodes_into {
  208. my ($self, $trie, $prefix) = @_;
  209. $trie->insert($prefix, $self->{label}, $self->{value}) if (defined $self->{value});
  210. foreach my $key (sort keys %{$self->{children}}) {
  211. $self->{children}{$key}->reinsert_value_nodes_into($trie, $prefix . $key);
  212. }
  213. }
  214. # (helper for rebuild_tree)
  215. # Find the earliest point to split a key. Normally, we split at the maximum
  216. # power of 2 that is greater or equal than the length of the key. When we
  217. # are building an ASCII-optimised case-insensitive trie that simply ORs
  218. # each byte with 0x20, we need to split at the first ambiguous character:
  219. #
  220. # For example, the words a-bc and a\rbc are identical in such a situation:
  221. # '-' | 0x20 == '-' == '\r' | 0x20
  222. # We cannot simply switch on all 4 bytes at once, but need to split before
  223. # the ambiguous character so we can process the ambiguous character on its
  224. # own.
  225. sub find_ealier_split {
  226. my ($self, $key) = @_;
  227. if ($ignore_case) {
  228. for my $i (0..length($key)-1) {
  229. # If the key starts with an ambiguous character, we need to
  230. # take only it. Otherwise, we need to take everything
  231. # before the character.
  232. return $self->alignpower2($i || 1) if (main::ambiguous(substr($key, $i, 1)));
  233. }
  234. }
  235. return $self->alignpower2(length $key);
  236. }
  237. # This rebuilds the trie, splitting each key before ambiguous characters
  238. # as explained in find_earlier_split(), and then chooses the smallest
  239. # such split at each level, so that all keys at all levels have the same
  240. # length (so we can use a multi-byte switch).
  241. sub rebuild_tree {
  242. my $self = shift;
  243. # Determine if/where we need to split before an ambiguous character
  244. my $new_split = 99999999999999999;
  245. foreach my $key (sort keys %{$self->{children}}) {
  246. my $special_length = $self->find_ealier_split($key);
  247. $new_split = $special_length if ($special_length < $new_split);
  248. }
  249. # Start building a new uniform trie
  250. my $newself = Trie->new;
  251. $newself->{label} = $self->{label};
  252. $newself->{value} = $self->{value};
  253. $newself->{children} = {};
  254. foreach my $key (sort keys %{$self->{children}}) {
  255. my $head = substr($key, 0, $new_split);
  256. my $tail = substr($key, $new_split);
  257. # Rebuild the child node at $head, pushing $tail downwards
  258. $newself->{children}{$head} //= Trie->new;
  259. $self->{children}{$key}->reinsert_value_nodes_into($newself->{children}{$head}, $tail);
  260. # We took up to one special character of each key label. There might
  261. # be more, so we need to rebuild recursively.
  262. $newself->{children}{$head} = $newself->{children}{$head}->rebuild_tree();
  263. }
  264. return $newself;
  265. }
  266. }
  267. # Code generator for C and C++
  268. package CCodeGen {
  269. my $static = ($code_name eq $header_name) ? "static" : "";
  270. my $enum_specifier = $enum_class ? "enum class" : "enum";
  271. sub new {
  272. my $class = shift;
  273. my $self = {};
  274. bless $self, $class;
  275. return $self;
  276. }
  277. sub open_output {
  278. my $self = shift;
  279. if ($code_name ne "-") {
  280. open($code, '>', $code_name) or die "Cannot open $code_name: $!" ;
  281. } else {
  282. $code = *STDOUT;
  283. }
  284. if($code_name eq $header_name) {
  285. $header = $code;
  286. } elsif ($header_name ne "-") {
  287. open($header, '>', $header_name) or die "Cannot open $header_name: $!" ;
  288. } else {
  289. $header = *STDOUT;
  290. }
  291. }
  292. sub word_to_label {
  293. my ($class, $word) = @_;
  294. $word =~ s/_/__/g;
  295. $word =~ s/-/_/g;
  296. return $word;
  297. }
  298. # Return a case label, by shifting and or-ing bytes in the word
  299. sub case_label {
  300. my ($self, $key) = @_;
  301. return sprintf("'%s'", substr($key, 0, 1)) if not $multi_byte;
  302. my $output = '0';
  303. for my $i (0..length($key)-1) {
  304. $output .= sprintf("| onechar('%s', %d, %d)", substr($key, $i, 1), 8 * $i, 8*length($key));
  305. }
  306. return $output;
  307. }
  308. # Return an appropriate read instruction for $length bytes from $offset
  309. sub switch_key {
  310. my ($self, $offset, $length) = @_;
  311. return "string[$offset]" if $length == 1;
  312. return sprintf("*((triehash_uu%s*) &string[$offset])", $length * 8);
  313. }
  314. # Render the trie so that it matches the longest prefix.
  315. sub print_table {
  316. my ($self, $trie, $fh, $indent, $index) = @_;
  317. $indent //= 0;
  318. $index //= 0;
  319. # If we have children, try to match them.
  320. if (%{$trie->{children}}) {
  321. # The difference between lowercase and uppercase alphabetical characters
  322. # is that they have one bit flipped. If we have alphabetical characters
  323. # in the search space, and the entire search space works fine if we
  324. # always turn on the flip, just OR the character we are switching over
  325. # with the bit.
  326. my $want_use_bit = 0;
  327. my $can_use_bit = 1;
  328. my $key_length = 0;
  329. foreach my $key (sort keys %{$trie->{children}}) {
  330. $can_use_bit &= not main::ambiguous($key);
  331. $want_use_bit |= ($key =~ /^[a-zA-Z]+$/);
  332. $key_length = length($key);
  333. }
  334. if ($ignore_case && $can_use_bit && $want_use_bit) {
  335. printf $fh ((" " x $indent) . "switch(%s | 0x%s) {\n", $self->switch_key($index, $key_length), "20" x $key_length);
  336. } else {
  337. printf $fh ((" " x $indent) . "switch(%s) {\n", $self->switch_key($index, $key_length));
  338. }
  339. my $notfirst = 0;
  340. foreach my $key (sort keys %{$trie->{children}}) {
  341. if ($notfirst) {
  342. printf $fh (" " x $indent . " break;\n");
  343. }
  344. if ($ignore_case) {
  345. printf $fh (" " x $indent . "case %s:\n", $self->case_label(lc($key)));
  346. printf $fh (" " x $indent . "case %s:\n", $self->case_label(uc($key))) if lc($key) ne uc($key) && !($can_use_bit && $want_use_bit);
  347. } else {
  348. printf $fh (" " x $indent . "case %s:\n", $self->case_label($key));
  349. }
  350. $self->print_table($trie->{children}{$key}, $fh, $indent + 1, $index + length($key));
  351. $notfirst=1;
  352. }
  353. printf $fh (" " x $indent . "}\n");
  354. }
  355. # This node has a value, so it is a possible end point. If no children
  356. # matched, we have found our longest prefix.
  357. if (defined $trie->{value}) {
  358. printf $fh (" " x $indent . "return %s;\n", ($enum_class ? "${enum_name}::" : "").$trie->{label});
  359. }
  360. }
  361. sub print_words {
  362. my ($self, $trie, $fh, $indent, $sofar) = @_;
  363. $indent //= 0;
  364. $sofar //= "";
  365. printf $fh (" " x $indent."%s = %s,\n", $trie->{label}, $trie->{value}) if defined $trie->{value};
  366. foreach my $key (sort keys %{$trie->{children}}) {
  367. $self->print_words($trie->{children}{$key}, $fh, $indent, $sofar . $key);
  368. }
  369. }
  370. sub print_functions {
  371. my ($self, $trie, %lengths) = @_;
  372. foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
  373. print $code ("static enum ${enum_name} ${function_name}${local_length}(const char *string)\n");
  374. print $code ("{\n");
  375. $self->print_table($trie->filter_depth($local_length)->rebuild_tree(), $code, 1);
  376. printf $code (" return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : ""));
  377. print $code ("}\n");
  378. }
  379. }
  380. sub main {
  381. my ($self, $trie, $num_values, %lengths) = @_;
  382. print $header ("#ifndef TRIE_HASH_${function_name}\n");
  383. print $header ("#define TRIE_HASH_${function_name}\n");
  384. print $header ("#include <stddef.h>\n");
  385. print $header ("#include <stdint.h>\n");
  386. foreach my $include (@includes) {
  387. print $header ("#include $include\n");
  388. }
  389. printf $header ("enum { $counter_name = $num_values };\n") if (defined($counter_name));
  390. print $header ("${enum_specifier} ${enum_name} {\n");
  391. $self->print_words($trie, $header, 1);
  392. printf $header (" $unknown_label = $unknown,\n");
  393. print $header ("};\n");
  394. print $header ("$static enum ${enum_name} ${function_name}(const char *string, size_t length);\n");
  395. print $code ("#include \"$header_name\"\n") if ($header_name ne $code_name);
  396. if ($multi_byte) {
  397. print $code ("#ifdef __GNUC__\n");
  398. for (my $i=16; $i <= 64; $i *= 2) {
  399. print $code ("typedef uint${i}_t __attribute__((aligned (1))) triehash_uu${i};\n");
  400. print $code ("typedef char static_assert${i}[__alignof__(triehash_uu${i}) == 1 ? 1 : -1];\n");
  401. }
  402. print $code ("#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__\n");
  403. print $code ("#define onechar(c, s, l) (((uint64_t)(c)) << (s))\n");
  404. print $code ("#else\n");
  405. print $code ("#define onechar(c, s, l) (((uint64_t)(c)) << (l-8-s))\n");
  406. print $code ("#endif\n");
  407. print $code ("#if (!defined(__ARM_ARCH) || defined(__ARM_FEATURE_UNALIGNED)) && !defined(TRIE_HASH_NO_MULTI_BYTE)\n");
  408. print $code ("#define TRIE_HASH_MULTI_BYTE\n");
  409. print $code ("#endif\n");
  410. print $code ("#endif /*GNUC */\n");
  411. print $code ("#ifdef TRIE_HASH_MULTI_BYTE\n");
  412. $self->print_functions($trie, %lengths);
  413. $multi_byte = 0;
  414. print $code ("#else\n");
  415. $self->print_functions($trie, %lengths);
  416. print $code ("#endif /* TRIE_HASH_MULTI_BYTE */\n");
  417. } else {
  418. $self->print_functions($trie, %lengths);
  419. }
  420. print $code ("$static enum ${enum_name} ${function_name}(const char *string, size_t length)\n");
  421. print $code ("{\n");
  422. print $code (" switch (length) {\n");
  423. foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
  424. print $code (" case $local_length:\n");
  425. print $code (" return ${function_name}${local_length}(string);\n");
  426. }
  427. print $code (" default:\n");
  428. printf $code (" return %s$unknown_label;\n", ($enum_class ? "${enum_name}::" : ""));
  429. print $code (" }\n");
  430. print $code ("}\n");
  431. # Print end of header here, in case header and code point to the same file
  432. print $header ("#endif /* TRIE_HASH_${function_name} */\n");
  433. }
  434. }
  435. # A character is ambiguous if the 1<<5 (0x20) bit does not correspond to the
  436. # lower case bit. A word is ambiguous if any character is. This definition is
  437. # used to check if we can perform the |0x20 optimization when building a case-
  438. # insensitive trie.
  439. sub ambiguous {
  440. my $word = shift;
  441. foreach my $char (split //, $word) {
  442. # If 0x20 does not solely indicate lowercase, it is ambiguous
  443. return 1 if ord(lc($char)) != (ord($char) | 0x20);
  444. return 1 if ord(uc($char)) != (ord($char) & ~0x20);
  445. }
  446. return 0;
  447. }
  448. sub build_trie {
  449. my $codegen = shift;
  450. my $trie = Trie->new;
  451. my $counter = $counter_start;
  452. my %lengths;
  453. open(my $input, '<', $ARGV[0]) or die "Cannot open ".$ARGV[0].": $!";
  454. while (my $line = <$input>) {
  455. my ($label, $word, $value) = $line =~/\s*(?:([^~\s]+)\s*~)?(?:\s*([^~=\s]+)\s*)?(?:=\s*([^\s]+)\s+)?\s*/;
  456. if (defined $word) {
  457. $counter = $value if defined($value);
  458. $label //= $codegen->word_to_label($word);
  459. $trie->insert($word, $label, $counter);
  460. $lengths{length($word)} = 1;
  461. $counter++;
  462. } elsif (defined $value) {
  463. $unknown = $value;
  464. $unknown_label = $label if defined($label);
  465. $counter = $value + 1;
  466. } else {
  467. die "Invalid line: $line";
  468. }
  469. }
  470. return ($trie, $counter, %lengths);
  471. }
  472. # Generates an ASCII art tree
  473. package TreeCodeGen {
  474. sub new {
  475. my $class = shift;
  476. my $self = {};
  477. bless $self, $class;
  478. return $self;
  479. }
  480. sub word_to_label {
  481. my ($self, $word) = @_;
  482. return $word;
  483. }
  484. sub main {
  485. my ($self, $trie, $counter, %lengths) = @_;
  486. printf $code ("┌────────────────────────────────────────────────────┐\n");
  487. printf $code ("│ Initial trie │\n");
  488. printf $code ("└────────────────────────────────────────────────────┘\n");
  489. $self->print($trie);
  490. printf $code ("┌────────────────────────────────────────────────────┐\n");
  491. printf $code ("│ Rebuilt trie │\n");
  492. printf $code ("└────────────────────────────────────────────────────┘\n");
  493. $self->print($trie->rebuild_tree());
  494. foreach my $local_length (sort { $a <=> $b } (keys %lengths)) {
  495. printf $code ("┌────────────────────────────────────────────────────┐\n");
  496. printf $code ("│ Trie for words of length %-4d │\n", $local_length);
  497. printf $code ("└────────────────────────────────────────────────────┘\n");
  498. $self->print($trie->filter_depth($local_length)->rebuild_tree());
  499. }
  500. }
  501. sub open_output {
  502. my $self = shift;
  503. if ($code_name ne "-") {
  504. open($code, '>', $code_name) or die "Cannot open ".$ARGV[0].": $!" ;
  505. } else {
  506. $code = *STDOUT;
  507. }
  508. }
  509. # Print a trie
  510. sub print {
  511. my ($self, $trie, $depth) = @_;
  512. $depth //= 0;
  513. print $code (" → ") if defined($trie->{label});
  514. print $code ($trie->{label} // "", "\n");
  515. foreach my $key (sort keys %{$trie->{children}}) {
  516. print $code ("│ " x ($depth), "├── $key");
  517. $self->print($trie->{children}{$key}, $depth + 1);
  518. }
  519. }
  520. }
  521. my %codegens = (
  522. C => "CCodeGen",
  523. tree => "TreeCodeGen",
  524. );
  525. defined($codegens{$language}) or die "Unknown language $language. Valid choices: ", join(", ", keys %codegens);
  526. my $codegen = $codegens{$language}->new();
  527. my ($trie, $counter, %lengths) = build_trie($codegen);
  528. $codegen->open_output();
  529. $codegen->main($trie, $counter, %lengths);
  530. =head1 LICENSE
  531. triehash is available under the MIT/Expat license, see the source code
  532. for more information.
  533. =head1 AUTHOR
  534. Julian Andres Klode <jak@jak-linux.org>
  535. =cut