CWB
|
#define DEFAULT_BUCKETS 1000000 |
use 1 million buckets by default
Referenced by main().
#define MAX_N 32 |
maximum value of N (makes life a little easier)
Referenced by main(), and scancorpus_usage().
typedef struct _hash_entry * HashEntry |
Structure representing hash entries.
int find_prime(int n)
Finds a prime number.
Returns the smallest prime >= n.
n | lower bound for the generated prime. |
References is_prime().
int get_next_range(int *start, int *end)
Reads the next range of corpus positions.
The ranges of corpus positions are taken either from global settings (-s, -e) or from a specified file (-R).
start | Where to put the start of the next range. |
end | Where to put the end of the next range. |
References CL_MAX_LINE_LENGTH, global_end, global_start, and ranges_fh.
Referenced by main().
void hash_add(int *tuple, int f)
Inserts an N-tuple into the global hash.
If the N-tuple is already in the hash, its count is incremented by f and no new entry is inserted.
tuple | The tuple to add (array of ints). |
f | The frequency of the tuple. |
References cl_malloc(), _hash_entry::freq, Hash, hash_find(), _Hash::K, _hash_entry::next, _Hash::table, and _hash_entry::tuple.
Referenced by main().
HashEntry hash_find(int *tuple, int *R_index)
Finds an N-tuple in the global hash.
tuple | The tuple to search for. |
R_index | The index of the bucket containing the located HashEntry. |
References _Hash::buckets, Hash, hash_index(), _Hash::K, _hash_entry::next, _Hash::table, _hash_entry::tuple, and tuples_eq().
Referenced by hash_add().
unsigned int hash_index(int N, int *tuple)
Computes a hash index for an N-tuple of ints.
N | Size of the tuple. |
tuple | The tuple itself: an array of ints. |
Referenced by hash_find().
int is_letter(unsigned char c)
Checks whether a character is a letter in Latin-1.
This function is no longer used; it is not multi-charset-safe.
c | The character to check. |
Referenced by is_regular().
int is_prime(int n)
Checks whether a number is prime.
Returns True iff n is a prime.
n | number to check |
int is_regular(char *s)
Checks the regularity of a token.
A token is "regular" if it contains only letters, numbers and dashes (with no dash at the end).
"Regularity" is used as a filter on the corpus iff the -C option is specified.
s | String containing the token to check. |
References cl_regex_match(), and is_letter().
Referenced by main(), and scancorpus_add_key().
int main(int argc, char *argv[])
Main function for cwb-scan-corpus.
argc | Number of command-line arguments. |
argv | Command-line arguments. |
References _Hash::att, ATT_POS, _Hash::buckets, TCorpus::charset, check_words, cl_calloc(), cl_cpos2id(), cl_id2str(), cl_malloc(), cl_max_cpos(), cl_max_id(), CL_MAX_LINE_LENGTH, cl_max_struc(), cl_new_attribute, cl_new_corpus(), cl_new_regex(), cl_regex_match(), cl_standard_registry(), cl_struc2cpos(), cl_struc2str(), _Hash::constraint_ok, corpname, _Hash::current_struc, DEFAULT_BUCKETS, _Hash::end_cpos, find_prime(), _hash_entry::freq, _Hash::frequency, frequency_att, frequency_threshold, _Hash::frequency_values, get_next_range(), global_end, global_start, Hash, hash_add(), _Hash::id_list, _Hash::id_list_size, IGNORE_DIAC, _Hash::is_constraint, _Hash::is_negated, is_regular(), _Hash::is_structural, _Hash::K, MAX_N, _Hash::max_offset, _Hash::N, _hash_entry::next, _Hash::offset, output_file, progname, quiet, ranges_fh, ranges_file, reg_dir, _Hash::regex, scancorpus_add_key(), scancorpus_parse_options(), scancorpus_usage(), _Hash::source_base, _Hash::start_cpos, _Hash::table, _hash_entry::tuple, utf8, _Hash::virtual_id, and word.
void scancorpus_add_key(char *key)
Adds a key to global variable Hash.
key | String specifying the key (passed by main() from a command-line argument) |
References _Hash::att, ATT_POS, ATT_STRUC, buf, check_words, cl_corpus_charset(), cl_id2str(), CL_MAX_LINE_LENGTH, cl_max_struc(), cl_new_attribute, cl_new_regex(), cl_regex2id(), cl_struc2str(), cl_struc_values(), _Hash::constraint_ok, corpname, _Hash::current_struc, _Hash::end_cpos, Hash, _Hash::id_list, _Hash::id_list_size, IGNORE_CASE, IGNORE_DIAC, _Hash::is_constraint, _Hash::is_negated, is_regular(), _Hash::is_structural, _Hash::K, _Hash::max_offset, _Hash::N, _Hash::offset, _Hash::regex, _Hash::source_base, and _Hash::start_cpos.
Referenced by main().
int scancorpus_parse_options(int argc, char *argv[])
Parses the command-line options of the program.
References _Hash::buckets, check_words, frequency_att, frequency_threshold, global_end, global_start, Hash, output_file, quiet, ranges_file, reg_dir, and scancorpus_usage().
Referenced by main().
void scancorpus_usage(void)
Prints a usage message and exits the program.
References MAX_N, and VERSION.
Referenced by main(), and scancorpus_parse_options().
int tuples_eq(int N, int *t1, int *t2)
Compares two N-tuples for equality.
N | Size of the tuple. |
t1 | First tuple (array of ints of size N). |
t2 | Second tuple (array of ints of size N). |
Referenced by hash_find().
corpus we're working on
Referenced by regex2dfa(), and WriteStates().
int check_words = 0 |
if set, accept only 'regular' words in frequency counts
Referenced by main(), scancorpus_add_key(), and scancorpus_parse_options().
char* corpname = NULL |
corpus name (command-line)
Referenced by main(), and scancorpus_add_key().
char* frequency_att = NULL |
p-attribute with frequency entries for corpus rows (when abusing corpus as frequency database)
Referenced by main(), and scancorpus_parse_options().
int frequency_threshold = 0 |
frequency threshold for result table (-f option)
Referenced by main(), and scancorpus_parse_options().
int global_end = -1 |
end scanning at this cpos; will be set up in main() unless changed with the -e switch.
Referenced by get_next_range(), main(), and scancorpus_parse_options().
int global_start = 0 |
start scanning at this cpos (defaults to start of corpus)
Referenced by get_next_range(), main(), and scancorpus_parse_options().
struct _Hash Hash |
Referenced by hash_add(), hash_find(), LookUp(), main(), MakeExp(), scancorpus_add_key(), and scancorpus_parse_options().
char* output_file = NULL |
output file name (-o option)
char* progname = NULL |
name of this program (from shell command)
int quiet = 0 |
if set, don't show progress information on stderr
Referenced by cqp_parse_file(), main(), and scancorpus_parse_options().
FILE* ranges_fh = NULL |
filehandle corresponding to ranges_file
Referenced by get_next_range(), and main().
char* ranges_file = NULL |
file with ranges to scan (pairs of corpus positions)
Referenced by main(), and scancorpus_parse_options().
char* reg_dir = NULL |
registry directory (NULL -> use default)
Referenced by main(), and scancorpus_parse_options().
CL_Regex regular_rx = NULL |
regex object for use when check_words is true.