#include <ctype.h>
#include <math.h>
#include <stdarg.h>
#include <sys/types.h>
#include <time.h>
#include <dirent.h>
#include <errno.h>
#include <sys/stat.h>
#include "../cl/globals.h"
#include "../cl/macros.h"
#include "../cl/storage.h"
#include "../cl/lexhash.h"
#include "../cl/endian.h"
#include "../cl/attributes.h"
#include <sys/time.h>
#define FIELDSEPS "\t\n" |
Default string containing the characters that can function as field separators.
#define MAX_INPUT_LINE_LENGTH 65536 |
Input buffer size.
If we have XML tags with attributes, input lines can become pretty long (but there's basically just a single buffer)
#define MAXRANGES 1024 |
Maximum number of s-attributes; also the maximum number of p-attributes (this fixed limit could be removed by changing the implementation to a linked list).
Referenced by declare_range(), and parse_options().
#define POS_CORPUS "%s/%s.corpus" |
CL naming convention for P-attribute Corpus files.
Referenced by declare_wattr().
#define POS_LEX "%s/%s.lexicon" |
CL naming convention for P-attribute Lexicon files.
Referenced by declare_wattr().
#define POS_LEXIDX "%s/%s.lexicon.idx" |
CL naming convention for P-attribute Lexicon-index files.
Referenced by declare_wattr().
#define REP_CHECK_LEXHASH_SIZE 1000 |
Number of buckets in the lexhashes used for checking for duplicate errors (undeclared element and attribute names in XML tags).
Referenced by declare_range(), and main().
#define STRUC_AVS "%s/%s.avs" |
CL naming convention for S-attribute AVS files.
Referenced by declare_range().
#define STRUC_AVX "%s/%s.avx" |
CL naming convention for S-attribute AVX files.
Referenced by declare_range().
#define STRUC_RNG "%s/%s.rng" |
CL naming convention for S-attribute RNG files.
Referenced by declare_range().
#define UMASK 0644 |
User privileges of new files (octal format).
#define UNDEF_VALUE "__UNDEF__" |
Default string used as the value of P-attributes when a value is missing, i.e. if a tab-delimited field is empty.
void addline | ( | char * | str | ) |
Processes a token data line.
str | A string containing the line to process. |
References cl_free, cl_lexhash_add(), cl_lexhash_id(), cl_make_set(), cl_strdup(), decode_entities(), error(), field_separators, _cl_lexhash_entry::id, MAX_LINE_LENGTH, my_strtok(), NwriteInt(), WAttr::position, print_input_line(), silent, strip_blanks, undef_value, wattr_ptr, and xml_aware.
Referenced by main().
void close_range | ( | Range * | rng, | |
int | end_pos | |||
) |
Closes a currently open instance of an S-attribute.
rng | Pointer to the S-attribute to close. | |
end_pos | The corpus position at which this instance closes. |
References Range::avs, Range::avx, cl_free, cl_lexhash_add(), cl_lexhash_find(), cl_strdup(), cl_string_list_get(), cl_string_list_size(), _cl_lexhash_entry::data, error(), Range::fd, _cl_lexhash_entry_data::integer, MAX_LINE_LENGTH, Range::name, Range::num, NwriteInt(), Range::offset, _cl_lexhash_entry_data::pointer, print_input_line(), silent, and Range::store_values.
Referenced by main(), and open_range().
Range* declare_range | ( | char * | name, | |
char * | directory, | |||
int | store_values, | |||
int | null_attribute | |||
) |
Creates a Range object to store a specified s-attribute (and, if appropriate, does the same for children-attributes).
name | The string from the user specifying the name of this attribute, recursion and any "attributes" of this XML element - e.g. "text:0+id" | |
directory | The directory where the CWB data files will go. | |
store_values | boolean: indicates whether this s-attribute was specified with -V (true) or -S (false) when the program was invoked. | |
null_attribute | boolean: this is a null attribute, i.e. an XML element to be ignored. |
References Range::avs, Range::avx, buf, cl_calloc(), cl_free, cl_lexhash_add(), cl_lexhash_id(), cl_new_lexhash(), cl_new_string_list(), cl_strdup(), cl_string_list_append(), _cl_lexhash_entry::data, debug, Range::dir, error(), Range::fd, MAX_LINE_LENGTH, MAXRANGES, Range::name, Range::num, Range::offset, _cl_lexhash_entry_data::pointer, range_ptr, REP_CHECK_LEXHASH_SIZE, Range::store_values, STRUC_AVS, STRUC_AVX, and STRUC_RNG.
Referenced by parse_options().
int declare_wattr | ( | char * | name, | |
char * | directory, | |||
int | nr_buckets | |||
) |
Sets up a new p-attribute, including opening corpus, lex and index file handles.
name | Identifier string of the p-attribute | |
directory | Directory in which CWB data files are to be created. | |
nr_buckets | Number of buckets in the lexhash of the new p-attribute (value passed to cl_new_lexhash() ) |
References cl_new_lexhash(), cl_strdup(), DEFAULT_ATT_NAME, error(), WAttr::feature_set, WAttr::lh, MAX_LINE_LENGTH, WAttr::name, POS_CORPUS, POS_LEX, POS_LEXIDX, WAttr::position, and wattr_ptr.
Referenced by parse_options().
char* decode_entities | ( | char * | s | ) |
Decode XML entities in a string.
This function decodes pre-defined XML entities in string s. It overwrites the input string s and also returns s for convenience.
(The entities are &lt; &gt; &amp; &quot; &apos;).
If passed NULL, it will not fall over - it will just pass NULL back!
s | A string to decode. |
Referenced by addline(), and open_range().
void error | ( | char * | format, | |
... | ||||
) |
Prints an error message to STDERR, automatically adding a message on the location of the error in the corpus.
format | Format-specifying string of the error message. | |
... | Additional arguments, printf-style. |
References current_input_file, input_line, and print_input_line().
Referenced by addline(), call_dynamic_attribute(), close_range(), declare_range(), declare_wattr(), get_input_line(), main(), parse_options(), and scan_directory().
int find_range | ( | char * | name | ) |
Gets the index (in the ranges array) of a specified S-attribute.
name | The S-attribute to search for. |
References range_ptr.
Referenced by main(), and parse_options().
int find_wattr | ( | char * | name | ) |
Gets the index (in wattrs) of the P-attribute with the given name.
name | The P-attribute to search for. |
References wattr_ptr.
Referenced by parse_options().
int get_input_line | ( | char * | buffer, | |
int | bufsize | |||
) |
Reads one input line into the specified buffer (either from stdin, or from one or more input files).
The input files are not passed to the function, but are taken from the program's global variables.
This function returns False when the last input file has been completely read, and automatically closes files.
buffer | Where to load the line to. | |
bufsize | Not currently used, but should be MAX_INPUT_LINE_LENGTH in case of future use! |
References cl_string_list_get(), current_input_file, current_input_file_name, error(), input_fd, input_file_is_pipe, input_line, MAX_INPUT_LINE_LENGTH, MAX_LINE_LENGTH, and nr_input_files.
Referenced by main().
int main | ( | int | argc, | |
char ** | argv | |||
) |
Main function for cwb-encode.
As well as the entry point to the program, this contains the main loop for each line of the corpus to be encoded.
The string of each line is sent to one of a number of different functions, depending on what is found in that string!
argc | Number of command-line arguments. | |
argv | Command-line arguments. |
References addline(), Range::avs, Range::avx, buf, cl_free, cl_lexhash_add(), cl_lexhash_freq(), cl_malloc(), cl_new_lexhash(), cl_new_string_list(), cl_set_debug_level(), cl_strdup(), cl_string_list_get(), cl_string_list_size(), close_range(), corpus_character_set, debug, directory, error(), Range::fd, find_range(), get_input_line(), input_line, line, MAX_INPUT_LINE_LENGTH, Range::name, nr_input_files, open_range(), parse_options(), print_input_line(), print_range_registry_line(), printtime(), progname, quote_file_path(), range_ptr, registry_file, REP_CHECK_LEXHASH_SIZE, silent, skip_empty_lines, Range::store_values, strip_blanks, verbose, wattr_ptr, and xml_aware.
char* my_strtok | ( | register char * | s, | |
register const char * | delim | |||
) |
void open_range | ( | Range * | rng, | |
int | start_pos, | |||
char * | annot | |||
) |
Opens an instance of the given S-attribute.
If rng has element attribute children, open_range() will modify the annotation string (otherwise it is left untouched).
rng | The S-attribute to open. | |
start_pos | The corpus position at which this instance begins. | |
annot | The annotation string (the XML element's att-val pairs). |
References cl_free, cl_lexhash_add(), cl_lexhash_find(), cl_lexhash_freq(), cl_make_set(), cl_strdup(), cl_string_list_get(), cl_string_list_size(), close_range(), _cl_lexhash_entry::data, decode_entities(), _cl_lexhash_entry_data::integer, line, Range::name, _cl_lexhash_entry_data::pointer, print_input_line(), silent, Range::store_values, and strip_blanks.
Referenced by main(), and write_region_to_disk().
void parse_options | ( | int | argc, | |
char ** | argv | |||
) |
Parses program options and sets global variables.
References cl_charset_name_canonical(), cl_delete_string_list(), cl_string_list_append(), cl_string_list_get(), cl_string_list_size(), corpus_character_set, debug, declare_range(), declare_wattr(), DEFAULT_ATT_NAME, directory, error(), find_range(), find_wattr(), MAXRANGES, progname, range_ptr, registry_file, scan_directory(), silent, skip_empty_lines, strip_blanks, undef_value, usage(), verbose, wattr_ptr, and xml_aware.
void print_input_line | ( | void | ) |
Prints the input line (and input file, if applicable) on STDERR, for error messages and warnings.
References current_input_file_name, input_line, and nr_input_files.
Referenced by addline(), close_range(), error(), main(), and open_range().
void print_range_registry_line | ( | Range * | rng, | |
FILE * | fd, | |||
int | print_comment | |||
) |
Prints registry lines for a given s-attribute, and its children, if any, to the specified file handle.
rng | The s-attribute in question. | |
fd | File handle for the registry file. | |
print_comment | Boolean: if true, a comment on the original XML tags is printed. |
References cl_lexhash_find(), cl_string_list_get(), cl_string_list_size(), _cl_lexhash_entry::data, Range::name, _cl_lexhash_entry_data::pointer, and Range::store_values.
Referenced by main().
void printtime | ( | FILE * | stream, | |
char * | msg | |||
) |
Prints a message plus the current time to the specified file/stream.
stream | Stream to print to. | |
msg | Message to incorporate into the string that is printed. |
Referenced by main().
char* quote_file_path | ( | char * | path | ) |
Add quotes and escape slashes to a path name if necessary.
This is for the HOME and INFO fields of the registry file.
For consistency, this function always returns a newly allocated string, regardless of whether changes have been made.
path | String containing the path to quotify. |
References cl_malloc(), and cl_strdup().
Referenced by main().
cl_string_list scan_directory | ( | char * | dir | ) |
Get a list of files in a given directory.
This function only lists files with .vrt or .vrt.gz extensions, and only files identified by POSIX stat() as "regular".
dir | Path of directory to look in. |
References cl_free, cl_malloc(), cl_new_string_list(), cl_string_list_append(), cl_string_list_qsort(), and error().
Referenced by parse_options().
void usage | ( | void | ) |
Prints a usage message and exits the program.
References progname, and undef_value.
char* corpus_character_set = "latin1" |
character set label that is inserted into the registry file
Referenced by main(), and parse_options().
int current_input_file = 0 |
index of input file currently being processed
Referenced by error(), and get_input_line().
char* current_input_file_name = NULL |
filename of current input file, for error messages
Referenced by get_input_line(), and print_input_line().
int debug = 0 |
debug mode on or off?
char* directory = NULL |
corpus data directory (no longer defaults to current directory)
Referenced by main(), and parse_options().
char* field_separators = FIELDSEPS |
string containing the characters that can function as field separators
Referenced by addline().
FILE* input_fd = NULL |
file handle for current input file (or pipe)
int input_file_is_pipe = 0 |
so we can properly close input_fd using either fclose() or pclose()
Referenced by get_input_line().
cl_string_list input_files = NULL |
list of input files (-f option(s))
int input_line = 0 |
input line number (reset for each new file) for error messages
Referenced by error(), get_input_line(), load_macro_file(), main(), and print_input_line().
int line = 0 |
corpus position currently being encoded (i.e. the cpos of the _next_ token)
Referenced by compose_kwic_line(), do_undump(), evaluate_subset(), evaluate_target(), FreeConcordanceLine(), goodbye(), html_print_output(), latex_print_output(), load_macro_file(), main(), open_range(), print_next_region(), PrintAttributes(), sgml_print_output(), skip_next_region(), and SortExternally().
int nr_input_files = 0 |
number of input files (length of list after option processing)
Referenced by get_input_line(), main(), and print_input_line().
char* progname = NULL |
name of the currently running program
int range_ptr = 0 |
Referenced by declare_range(), find_range(), main(), and parse_options().
char* registry_file = NULL |
if set, auto-generate registry file named {registry_file}, listing declared attributes
Referenced by main(), and parse_options().
int silent = 0 |
hide messages
int skip_empty_lines = 0 |
skip empty lines when encoding?
int strip_blanks = 0 |
strip leading and trailing blanks from input and token annotations
cl_lexhash undeclared_sattrs = NULL |
lookup hash for undeclared s-attributes and s-attributes declared with -S that have annotations (which will be ignored), so warnings are issued only once
char* undef_value = UNDEF_VALUE |
string used as the value of P-attributes when a value is missing, i.e. if a tab-delimited field is empty
Referenced by addline(), parse_options(), and usage().
int verbose = 0 |
show progress (this is _not_ the opposite of silent!)
int wattr_ptr = 0 |
Referenced by addline(), declare_wattr(), find_wattr(), main(), and parse_options().
int xml_aware = 0 |
substitute XML entities in p-attributes & ignore <? and <! lines