#include "../cl/globals.h"
#include "../cl/cl.h"
#define DEFAULT_BUCKETS 1000000
use 1 million buckets by default
Referenced by main().
#define MAX_N 32
maximum value of N (makes life a little easier)
Referenced by main().
typedef struct _hash_entry *HashEntry
Structure representing hash entries.
void add_key(char *key)
Adds a key to the global variable Hash.
key: String specifying the key (passed by main() from a command-line argument).
References _Hash::att, ATT_POS, ATT_STRUC, buf, check_words, cl_corpus_charset(), cl_id2str, cl_max_struc(), cl_new_attribute, cl_new_regex(), cl_regex2id, cl_struc2str, cl_struc_values, _Hash::constraint_ok, corpname, _Hash::current_struc, _Hash::end_cpos, Hash, _Hash::id_list, _Hash::id_list_size, IGNORE_CASE, IGNORE_DIAC, _Hash::is_constraint, is_regular(), _Hash::is_structural, _Hash::K, MAX_LINE_LENGTH, _Hash::max_offset, _Hash::N, _Hash::offset, _Hash::regex, _Hash::source_base, and _Hash::start_cpos.
Referenced by main().
int find_prime(int n)
Finds a prime number.
Returns the smallest prime >= n.
n: Lower bound for the generated prime.
References is_prime().
int get_next_range(int *start, int *end)
Reads the next range of corpus positions.
The ranges of corpus positions are taken either from global settings (-s, -e) or from a specified file (-R).
start: Where to put the start of the next range.
end: Where to put the end of the next range.
References global_end, global_start, and ranges_fh.
Referenced by main().
void hash_add(int *tuple, int f)
Inserts an N-tuple into the global hash.
If the N-tuple is already in the hash, its count is incremented by f, but nothing is inserted.
tuple: The tuple to add (array of ints).
f: The frequency of the tuple.
References cl_malloc(), _hash_entry::freq, Hash, hash_find(), _Hash::K, _hash_entry::next, _Hash::table, and _hash_entry::tuple.
Referenced by main().
HashEntry hash_find(int *tuple, int *R_index)
Finds an N-tuple in the global hash.
tuple: The tuple to search for.
R_index: The index of the bucket containing the located HashEntry.
References _Hash::buckets, Hash, hash_index(), _Hash::K, _hash_entry::next, _Hash::table, _hash_entry::tuple, and tuples_eq().
Referenced by hash_add().
unsigned int hash_index(int N, int *tuple)
Computes a hash index for an N-tuple of ints.
N: Size of the tuple.
tuple: The tuple itself: an array of ints.
Referenced by hash_find().
int is_letter(unsigned char c)
Checks whether a character is a letter.
c: The character to check.
Referenced by is_regular().
int is_prime(int n)
Checks whether a number is prime.
Returns True iff n is a prime.
n: Number to check.
int is_regular(char *s)
Checks the regularity of a token.
A token is "regular" if it contains only letters, numbers and dashes (with no dash at the end).
"Regularity" is used as a filter on the corpus iff the -C option is specified.
Character encoding: Latin-1.
s: String containing the token to check.
References is_letter().
int main(int argc, char *argv[])
Main function for cwb-scan-corpus.
argc: Number of command-line arguments.
argv: Command-line arguments.
References add_key(), _Hash::att, ATT_POS, _Hash::buckets, check_words, cl_calloc(), cl_cpos2id, cl_id2str, cl_malloc(), cl_max_cpos, cl_max_id, cl_max_struc(), cl_new_attribute, cl_new_corpus, cl_regex_match(), cl_standard_registry, cl_struc2cpos, cl_struc2str, _Hash::constraint_ok, corpname, _Hash::current_struc, DEFAULT_BUCKETS, _Hash::end_cpos, find_prime(), _hash_entry::freq, _Hash::frequency, frequency_att, frequency_threshold, _Hash::frequency_values, get_next_range(), global_end, global_start, Hash, hash_add(), _Hash::id_list, _Hash::id_list_size, _Hash::is_constraint, is_regular(), _Hash::is_structural, _Hash::K, MAX_LINE_LENGTH, MAX_N, _Hash::max_offset, _Hash::N, _hash_entry::next, _Hash::offset, output_file, parse_options(), progname, quiet, ranges_fh, ranges_file, reg_dir, _Hash::regex, _Hash::source_base, _Hash::start_cpos, _Hash::table, _hash_entry::tuple, usage(), _Hash::virtual_id, and word.
int parse_options(int argc, char *argv[])
Parses the command-line options of the program.
References _Hash::buckets, check_words, frequency_att, frequency_threshold, global_end, global_start, Hash, output_file, quiet, ranges_file, reg_dir, and usage().
int tuples_eq(int N, int *t1, int *t2)
Compares two N-tuples for equality.
N: Size of the tuple.
t1: First tuple (array of ints of size N).
t2: Second tuple (array of ints of size N).
Referenced by hash_find().
void usage(void)
Prints a usage message and exits the program.
References progname.
corpus we're working on
Referenced by regex2dfa() and WriteStates().
int check_words = 0
if set, accept only 'regular' words in frequency counts
Referenced by add_key(), main(), and parse_options().
char* frequency_att = NULL
p-attribute with frequency entries for corpus rows (when abusing corpus as frequency database)
Referenced by main() and parse_options().
int frequency_threshold = 0
frequency threshold for result table (-f option)
Referenced by main() and parse_options().
int global_end = -1
end scanning at this cpos; will be set up in main() unless changed with the -e switch
Referenced by get_next_range(), main(), and parse_options().
int global_start = 0
start scanning at this cpos (defaults to start of corpus)
Referenced by get_next_range(), main(), and parse_options().
struct _Hash Hash
A specialised hash for computing frequency distributions over tuples of lexicon IDs.
Referenced by add_key(), hash_add(), hash_find(), LookUp(), main(), MakeExp(), and parse_options().
char* output_file = NULL
output file name (-o option)
char* progname = NULL
name of this program (from shell command)
int quiet = 0
if set, don't show progress information on stderr
Referenced by cqp_parse_file(), main(), and parse_options().
FILE* ranges_fh = NULL
filehandle corresponding to ranges_file
Referenced by get_next_range() and main().
char* ranges_file = NULL
file with ranges to scan (pairs of corpus positions)
Referenced by main() and parse_options().
char* reg_dir = NULL
registry directory (NULL -> use default)
Referenced by main() and parse_options().