cl.h File Reference

#include <strings.h>

Data Structures

Defines

Typedefs

Enumerations

Functions

Variables


Define Documentation

#define ATT_ALIGN   (1<<2)
#define ATT_ALL   ATT_POS | ATT_STRUC | ATT_ALIGN | ATT_DYN

shorthand for "all types of attribute"

#define ATT_DYN   (1<<6)
#define ATT_NONE   0
#define ATT_POS   (1<<0)
#define ATT_REAL   ATT_POS | ATT_STRUC | ATT_ALIGN

shorthand for "all types of attribute except dynamic"

#define ATT_STRUC   (1<<1)
#define ATTAT_FLOAT   5
#define ATTAT_INT   3
#define ATTAT_NONE   0
#define ATTAT_PAREF   6
#define ATTAT_POS   1
#define ATTAT_STRING   2
#define ATTAT_VAR   4
#define CDA_EALIGN   -9

no alignment at position

Referenced by cdperror_string(), cl_cpos2alg(), and get_extended_alignment().

#define CDA_EARGS   -12

error in arguments for dynamic call

Referenced by call_dynamic_attribute(), and cdperror_string().

#define CDA_EATTTYPE   -2

function was called on illegal attribute

Referenced by cdperror_string(), and send_cl_error().

#define CDA_EBADREGEX   -16

bad regular expression

Referenced by cdperror_string(), cl_new_regex(), collect_matching_ids(), and send_cl_error().

#define CDA_EBUFFER   -18

buffer overflow (hard-coded internal buffer sizes)

Referenced by cdperror_string(), and cl_set_intersection().

#define CDA_EFSETINV   -17

invalid feature set format

Referenced by cdperror_string(), cl_make_set(), cl_set_intersection(), and cl_set_size().

#define CDA_EIDORNG   -3
#define CDA_EIDXORNG   -5
#define CDA_EINTERNAL   -19

internal data consistency error (really bad)

Referenced by cdperror_string(), and structure_value().

#define CDA_ENODATA   -11
#define CDA_ENOMEM   -13

memory fault [unused]

Referenced by cdperror_string(), and send_cl_error().

#define CDA_ENOSTRING   -6

no such string encoded

Referenced by cdperror_string(), and get_id_of_string().

#define CDA_ENULLATT   -1

NULL passed as attribute argument.

Referenced by cdperror_string().

#define CDA_ENYI   -15

not yet implemented

Referenced by cdperror_string(), get_sortidxpos_of_id(), and send_cl_error().

#define CDA_EOTHER   -14
#define CDA_EPATTERN   -7

illegal pattern

Referenced by cdperror_string(), and send_cl_error().

#define CDA_EPOSORNG   -4
#define CDA_EREMOTE   -10

error in remote access

Referenced by cdperror_string().

#define CDA_ESTRUC   -8

no structure at position

Referenced by cdperror_string(), cl_cpos2boundary(), get_num_of_struc(), and get_struc_attribute().

#define CDA_OK   0
#define cl_cpos2id ( a,
cpos   )     get_id_at_position(a, cpos)
#define cl_cpos2str ( a,
cpos   )     get_string_at_position(a, cpos)
#define cl_cpos2struc2cpos ( a,
cpos,
start,
end   )     get_struc_attribute(a, cpos, start, end)
#define cl_cpos2struc2str ( a,
cpos   )     structure_value_at_position(a, cpos)

Referenced by get_group_id(), and print_tabulation().

#define cl_delete_attribute ( a   )     attr_drop_attribute(a)

Referenced by cqi_drop_attribute().

#define cl_delete_corpus ( c   )     drop_corpus(c)

Referenced by main().

#define cl_delete_stream ( ps   )     ClosePositionStream(ps)
#define CL_DYN_STRING_SIZE   2048

maximum size of 'dynamic' strings

Referenced by call_predefined_function(), and cl_set_intersection().

#define cl_errno   cderrno
#define cl_error ( message   )     cdperror(message)

Referenced by main().

#define cl_error_string ( no   )     cdperror_string(no)
#define cl_free ( p   )     do { if ((p) != NULL) { free(p); p = NULL; } } while (0)

Safely frees memory.

See also:
cl_malloc
Parameters:
p Pointer to memory to be freed.

Referenced by add_hosts_in_subnet_to_list(), addline(), after_Query(), assign_temp_to_sub(), attach_subcorpus(), attr_drop_attribute(), check_set(), cl_delete_int_list(), cl_delete_lexhash(), cl_delete_lexhash_entry(), cl_delete_regex(), cl_delete_string_list(), cl_free_string_list(), cl_lexhash_check_grow(), cl_make_set(), cl_new_regex(), close_range(), collect_matching_ids(), comp_drop_component(), creat_rev_corpus(), cwbci_check_line(), declare_range(), delete_interval(), delete_intervals(), DestroyAttributeList(), do_AddSubVariables(), do_cqi_cl_cpos2lbound(), do_cqi_cl_cpos2rbound(), do_cqi_cl_cpos2struc(), do_cqi_cqp_fdist_1(), do_cqi_cqp_fdist_2(), do_flagged_re_variable(), do_IDReference(), do_LabelReference(), do_SearchPattern(), do_StandardQuery(), do_undump(), do_XMLTag(), drop_corpus(), drop_mapping(), drop_single_mapping(), DropVariable(), evaltree2searchstr(), evaluate_target(), execute_side_effects(), expand_macro(), free_booltree(), free_environment(), free_group(), free_matchlist(), free_tabulation_list(), FreeIDList(), FreeSortClause(), get_fulllocalpath(), get_matched_corpus_positions(), get_positions(), initialize_cl(), load_macro_file(), MacroHashDelete(), main(), matchfirstpattern(), meet_mu(), open_input_stream(), open_range(), OptimizeStringConstraint(), parse_line(), print_tabulation(), RangeSetop(), RangeSort(), RecomputeAL(), RemoveNameFromAL(), scan_directory(), set_context_option_value(), set_corpus_matchlists(), set_target(), Setop(), setup_corpus(), SL_delete(), SortExternally(), SortSubcorpus(), SortSubcorpusRandomize(), split_subcorpus_spec(), Unchain(), validate_revcorp(), VariableDeleteItems(), VariableSubtractItem(), VerifyList(), and VerifyVariable().

#define cl_id2all ( a,
sid,
freq,
len   )     get_id_info(a, sid, freq, len)
#define cl_id2cpos ( a,
id,
freq   )     get_positions(a, id, freq, NULL, 0)

Referenced by do_cqi_cl_id2cpos().

#define cl_id2freq ( a,
id   )     get_id_frequency(a, id)
#define cl_id2sort ( a,
id   )     get_sortidxpos_of_id(a, id)
#define cl_id2str ( a,
id   )     get_string_of_id(a, id)
#define cl_id2strlen ( a,
id   )     get_id_string_len(a, id)

Referenced by create_feature_maps().

#define cl_idlist2cpos ( a,
idlist,
idlist_size,
sort,
size   )     collect_matches(a, idlist, idlist_size, sort, size, NULL, 0)

Referenced by do_cqi_cl_idlist2cpos().

#define cl_idlist2freq ( a,
list,
size   )     cumulative_id_frequency(a, list, size)
#define cl_index_compressed ( a   )     inverted_file_is_compressed(a)
#define cl_max_cpos ( a   )     get_attribute_size(a)
#define cl_max_id ( a   )     get_id_range(a)
#define cl_new_attribute ( c,
name,
type   )     find_attribute(c, name, type, NULL)
#define cl_new_corpus ( reg,
name   )     setup_corpus(reg, name)

Referenced by main(), and parse_options().

#define cl_new_stream ( a,
id   )     OpenPositionStream(a, id)
#define cl_read_stream ( ps,
buf,
size   )     ReadPositionStream(ps, buf, size)
#define cl_regex2id ( a,
re,
flags,
size   )     collect_matching_ids(a, re, flags, size)
#define cl_sequence_compressed ( a   )     item_sequence_is_compressed(a)
#define cl_sort2id ( a,
sid   )     get_id_from_sortidx(a, sid)
 
#define cl_standard_registry (  )     central_corpus_directory()

Referenced by find_corpus(), and main().

#define cl_str2id ( a,
str   )     get_id_of_string(a, str)
#define cl_struc2cpos ( a,
struc,
start,
end   )     get_bounds_of_nth_struc(a, struc, start, end)
#define cl_struc2str ( a,
struc   )     structure_value(a, struc)
#define cl_struc_values ( a   )     structure_has_values(a)
#define IGNORE_CASE   1

Flag ignore-case in regular expression matching.

See also:
cl_regex2id

Referenced by add_key(), cl_new_regex(), cl_string_maptable(), collect_matching_ids(), main(), print_pattern(), and setup_corpus().

#define IGNORE_DIAC   2

Flag ignore-diacritics in regular expression matching.

See also:
cl_regex2id

Referenced by add_key(), cl_new_regex(), cl_string_maptable(), collect_matching_ids(), main(), and print_pattern().

#define IGNORE_REGEX   4

Flag for: don't use regular expression matching - match as a literal string.

Not used in the CL but in use in CQP.

Referenced by do_flagged_re_variable(), do_flagged_string(), do_mval_string(), do_XMLTag(), and print_pattern().

#define STRUC_INSIDE   1

cl_cpos2boundary() return flag: specified position is WITHIN a region of this s-attribute

Referenced by cl_cpos2boundary().

#define STRUC_LBOUND   2

cl_cpos2boundary() return flag: specified position is AT THE START BOUNDARY OF a region of this s-attribute

Referenced by cl_cpos2boundary().

#define STRUC_RBOUND   4

cl_cpos2boundary() return flag: specified position is AT THE END BOUNDARY OF a region of this s-attribute

Referenced by cl_cpos2boundary().


Typedef Documentation

typedef union _Attribute Attribute

The Attribute object: an entire segment of a corpus (of any flavour; s, p etc).

typedef struct _CL_BitVec* CL_BitVec

The CL_BitVec object: doesn't seem to exist {???-- AH}.

typedef struct _cl_int_list* cl_int_list

Automatically growing list of integers (just what you always need...)

typedef struct _cl_lexhash* cl_lexhash

The cl_lexhash class (lexicon hashes, with IDs and frequency counts).

A "lexicon hash" links strings to integers. Each cl_lexhash object represents an entire table of such things; individual string-to-int links are represented by cl_lexhash_entry objects.

Within the cl_lexhash, the entries are grouped into buckets. A bucket is the term for a "slot" on the hash table. The linked list in a given bucket represents all the different string-keys that map to one particular index value.

Each entry contains the key itself (for search-and-retrieval), the frequency of that type (incremented when a token is added that is already in the lexhash), an ID integer, plus a bundle of "data" associated with that string.

These lexicon hashes are used, notably, in the encoding of corpora to CWB-index-format.

Underlying structure for the cl_lexhash_entry class.

typedef struct _CL_Regex* CL_Regex

The CL_Regex object: an optimised regular expression.

Automatically growing list of strings (just what you always need...)

typedef struct TCorpus Corpus

The Corpus object: contains information on a loaded corpus.

Identifier for one of the character sets supported by CWB.

typedef struct TCorpusProperty * CorpusProperty

The CorpusProperty object.

The underlying structure takes the form of a linked-list entry.

Each Corpus object has, as one of its members, the head entry on a list of CorpusProperties.

typedef struct _DCR DynCallResult

The DynCallResult object (needed to allocate space for dynamic function arguments).

The PositionStream object: gives stream-like reading of an Attribute.


Enumeration Type Documentation

Identifier for one of the character sets supported by CWB.

Enumerator:
ascii 
latin1 
latin2 
latin3 
latin4 
cyrillic 
arabic 
greek 
hebrew 
latin5 
latin6 
latin7 
latin8 
latin9 
utf8 
unknown_charset 

Function Documentation

int attr_drop_attribute ( Attribute attribute  ) 
int call_dynamic_attribute ( Attribute attribute,
DynCallResult dcr,
DynCallResult args,
int  nr_args 
)

Calls a dynamic attribute.

This is the attribute access function for dynamic attributes.

Parameters:
attribute The (dynamic) attribute in question.
dcr Location for the result (*int or *char).
args Location of the parameters (of *int or *char).
nr_args Number of parameters.
Returns:
Boolean: True for all OK, false for error.

References Dynamic_Attribute::arglist, ATT_DYN, ATTAT_FLOAT, ATTAT_INT, ATTAT_NONE, ATTAT_PAREF, ATTAT_POS, ATTAT_STRING, ATTAT_VAR, Dynamic_Attribute::call, CDA_EARGS, CDA_OK, cderrno, _DCR::charres, check_arg, cl_strdup(), _Attribute::dyn, error(), _DCR::floatres, _DCR::intres, _DynArg::next, Dynamic_Attribute::res_type, _DCR::type, _DynArg::type, and _DCR::value.

Referenced by get_leaf_value().

void cdperror ( char *  message  ) 

Prints an error message, together with a string identifying the current error number.

References cderrno, and cdperror_string().

Referenced by compress_reversed_index(), compute_code_lengths(), decompress_check_reversed_index(), do_show(), print_info(), and show_position_values().

char* cdperror_string ( int  errno  ) 

Gets a string describing the error identified by an error number.

Parameters:
errno Error number integer (a CDA_* constant as defined in cl.h)

References CDA_EALIGN, CDA_EARGS, CDA_EATTTYPE, CDA_EBADREGEX, CDA_EBUFFER, CDA_EFSETINV, CDA_EIDORNG, CDA_EIDXORNG, CDA_EINTERNAL, CDA_ENODATA, CDA_ENOMEM, CDA_ENOSTRING, CDA_ENULLATT, CDA_ENYI, CDA_EOTHER, CDA_EPATTERN, CDA_EPOSORNG, CDA_EREMOTE, CDA_ESTRUC, and CDA_OK.

Referenced by cdperror(), ensure_corpus_size(), and OptimizeStringConstraint().

char* central_corpus_directory (  ) 

Gets a string containing the path of the default registry directory.

Returns:
The value of the corpus-module-internal variable regdir, which is initialised from the environment variable REGISTRY_ENVVAR or, failing that, the macro REGISTRY_DEFAULT_PATH.

References regdir, REGISTRY_DEFAULT_PATH, and REGISTRY_ENVVAR.

Referenced by load_corpusnames(), main(), and setup_corpus().

int cl_alg2cpos ( Attribute attribute,
int  alg,
int *  source_region_start,
int *  source_region_end,
int *  target_region_start,
int *  target_region_end 
)

Gets the corpus positions of an alignment on the given align-attribute.

Note that four corpus positions are retrieved, into the addresses given as parameters.

Parameters:
attribute The align-attribute to look on.
alg The ID of the alignment whose positions are wanted.
source_region_start Location to put source corpus start position.
source_region_end Location to put source corpus end position.
target_region_start Location to put target corpus start position.
target_region_end Location to put target corpus end position.
Returns:
Boolean: true = all OK, false = problem.

References CDA_EIDXORNG, CDA_ENODATA, CDA_OK, cderrno, cl_has_extended_alignment(), CompAlignData, CompXAlignData, TMblob::data, TComponent::data, ensure_component(), and TComponent::size.

Referenced by check_alignment_constraints(), compose_kwic_line(), do_cqi_cl_alg2cpos(), printAlignedStrings(), and show_position_values().

void* cl_calloc ( size_t  nr_of_elements,
size_t  element_size 
)

safely allocates memory calloc-style.

See also:
cl_malloc
Parameters:
nr_of_elements Number of elements to allocate
element_size Size of each element
Returns:
Pointer to the block of allocated memory

Referenced by alloc_mblob(), cl_new_int_list(), cl_new_lexhash(), cl_new_string_list(), collect_matching_ids(), compute_code_lengths(), declare_range(), evaluate_target(), main(), and validate_revcorp().

CorpusCharset cl_charset_from_name ( char *  name  ) 

Gets a CorpusCharset enumeration with the id code for the given string.

References _charset_spec::name, and unknown_charset.

Referenced by add_corpus_property(), and cwbci_parse_options().

char* cl_charset_name ( CorpusCharset  id  ) 

Gets a string containing the name of the specified CorpusCharset character set object.

Note that the returned string must not be modified.

References _charset_spec::name.

Referenced by corpus_info().

char* cl_charset_name_canonical ( char *  name_to_check  ) 

Checks whether a string represents a valid charset, and returns a pointer to the name in canonical form (ie lacking any non-standard case there may be in the input string).

Parameters:
name_to_check String containing the character set name to be checked
Returns:
Pointer to canonical-form string for that charset's name or NULL if name_to_check cannot be linked to a valid charset.

References _charset_spec::name.

Referenced by cwbci_parse_options(), and parse_options().

CorpusCharset cl_corpus_charset ( Corpus corpus  ) 

Retrieves the special 'charset' property.

Parameters:
corpus The corpus object from which to retrieve the charset
Returns:
The character set (as a CorpusCharset object).

References TCorpus::charset.

Referenced by add_key(), and print_xml_declaration().

char* cl_corpus_property ( Corpus corpus,
char *  property 
)

Gets the value of the specified corpus property.

Parameters:
corpus Pointer to the Corpus object.
property Name of the property to retrieve.
Returns:
Pointer to string that contains the value of the property, or NULL if the specified property is undefined for this Corpus object.

References cl_first_corpus_property(), cl_next_corpus_property(), TCorpusProperty::property, and TCorpusProperty::value.

Referenced by add_corpus_property(), and corpus_info().

int cl_cpos2alg ( Attribute attribute,
int  cpos 
)

Gets the id number of the alignment at the specified corpus position.

Parameters:
attribute The align-attribute to look on.
cpos The corpus position to look at.
Returns:
The id number of the alignment at this position, or a negative int error code.

References CDA_EALIGN, CDA_ENODATA, CDA_EPOSORNG, CDA_OK, cderrno, cl_has_extended_alignment(), CompAlignData, CompXAlignData, TMblob::data, TComponent::data, ensure_component(), get_alignment(), get_extended_alignment(), and TComponent::size.

Referenced by check_alignment_constraints(), compose_kwic_line(), do_cqi_cl_cpos2alg(), printAlignedStrings(), and show_position_values().

int cl_cpos2boundary ( Attribute a,
int  cpos 
)

Compares the location of a corpus position to the regions of an s-attribute.

This determines whether the specified corpus position is within a region (i.e. a structure, an instance of that s-attribute) on the given s-attribute; and/or on a boundary; or outside a region.

See also:
STRUC_INSIDE
STRUC_LBOUND
STRUC_RBOUND
Parameters:
a The s-attribute on which to search.
cpos The corpus position to look for.
Returns:
0 if this position is outside a region; some combination of flags if it is within a region or on a bound; or a negative number (error code) in case of error.

References CDA_ESTRUC, cderrno, cl_cpos2struc2cpos, STRUC_INSIDE, STRUC_LBOUND, and STRUC_RBOUND.

int cl_cpos2struc ( Attribute a,
int  cpos 
)

Gets the ID number of a structure (instance of an s-attribute) that is found at the given corpus position.

Parameters:
a The s-attribute on which to search.
cpos The corpus position to look for.
Returns:
The number of the structure that is found.

References cderrno, and get_num_of_struc().

Referenced by compose_kwic_line(), do_cqi_cl_cpos2lbound(), do_cqi_cl_cpos2rbound(), do_cqi_cl_cpos2struc(), eval_constraint(), get_position_values(), main(), show_position_values(), and showSurroundingStructureValues().

void cl_delete_int_list ( cl_int_list  l  ) 

Deletes a cl_int_list object.

References cl_free, and _cl_int_list::data.

void cl_delete_lexhash ( cl_lexhash  hash  ) 

Deletes a cl_lexhash object.

This deletes all the entries in all the buckets in the lexhash, plus the cl_lexhash itself.

Parameters:
hash The cl_lexhash to delete.

References _cl_lexhash::buckets, cl_delete_lexhash_entry(), cl_free, _cl_lexhash_entry::next, and _cl_lexhash::table.

Referenced by main().

void cl_delete_regex ( CL_Regex  rx  ) 

Deletes a CL_Regex object.

Parameters:
rx The CL_Regex to delete.

References _CL_Regex::buffer, cl_free, _CL_Regex::grain, _CL_Regex::grains, and _CL_Regex::iso_string.

Referenced by collect_matching_ids(), free_booltree(), and free_environment().

void cl_delete_string_list ( cl_string_list  l  ) 

Deletes a cl_string_list object.

References cl_free, and _cl_string_list::data.

Referenced by cl_make_set(), main(), and parse_options().

CorpusProperty cl_first_corpus_property ( Corpus corpus  ) 

Gets the first entry in this corpus's list of properties.

(The corpus properties iterator / property datatype is public.)

Parameters:
corpus Pointer to the Corpus object.
Returns:
The first property.

References TCorpus::properties.

Referenced by cl_corpus_property(), and corpus_info().

void cl_free_string_list ( cl_string_list  l  ) 

Frees all the strings in the cl_string_list object.

References cl_free, _cl_string_list::data, and _cl_string_list::size.

Referenced by main().

void cl_get_rng_state ( unsigned int *  i1,
unsigned int *  i2 
)

Reads current state of CL-internal random number generator.

The integers currently held in RNG_I1 and RNG_I2 are written to the two memory locations supplied as arguments.

Parameters:
i1 Target location for the value of RNG_I1
i2 Target location for the value of RNG_I2

References RNG_I1, and RNG_I2.

int cl_has_extended_alignment ( Attribute attribute  ) 

Checks whether an attribute's XALIGN component exists, that is, whether or not it has extended alignment.

Parameters:
attribute An align-attribute.
Returns:
Boolean.

References ATT_ALIGN, cderrno, check_arg, component_state(), ComponentLoaded, ComponentUnloaded, and CompXAlignData.

Referenced by cl_alg2cpos(), cl_cpos2alg(), cl_max_alg(), and show_statistics().

void cl_int_list_append ( cl_int_list  l,
int  val 
)

Appends an integer to the end of a cl_int_list object.

References cl_int_list_set(), and _cl_int_list::size.

int cl_int_list_get ( cl_int_list  l,
int  n 
)

Retrieves an element from a cl_int_list object.

Parameters:
l The list to search.
n The element to retrieve.
Returns:
The n'th integer on the list, or 0 if there is no n'th integer.

References _cl_int_list::data, and _cl_int_list::size.

void cl_int_list_lumpsize ( cl_int_list  l,
int  s 
)

Sets the lumpsize of a cl_int_list object.

See also:
LUMPSIZE
Parameters:
l The cl_int_list.
s The new lumpsize.

References _cl_int_list::lumpsize, and LUMPSIZE.

void cl_int_list_qsort ( cl_int_list  l  ) 

Sorts a cl_int_list object.

The list of integers is sorted into ascending order.

References cl_int_list_intcmp(), _cl_int_list::data, and _cl_int_list::size.

void cl_int_list_set ( cl_int_list  l,
int  n,
int  val 
)

Sets an integer on a cl_int_list object.

The n'th element on the list is set to val, and the list is auto-extended if necessary.

References _cl_int_list::allocated, cl_realloc(), _cl_int_list::data, _cl_int_list::lumpsize, and _cl_int_list::size.

Referenced by cl_int_list_append().

int cl_int_list_size ( cl_int_list  l  ) 

Gets the current size of a cl_int_list object (number of elements on the list).

References _cl_int_list::size.

cl_lexhash_entry cl_lexhash_add ( cl_lexhash  hash,
char *  token 
)

Adds a token to a cl_lexhash table.

If the string is already in the hash, its frequency count is increased by 1.

Otherwise, a new entry is created, with an auto-assigned ID; note that the string is duplicated, so the original string that is passed to this function does not need to be kept in memory.

Parameters:
hash The hash table to add to.
token The string to add.
Returns:
A pointer to a (new or existing) entry

References cl_lexhash_find_i(), cl_malloc(), cl_strdup(), _cl_lexhash_entry::data, _cl_lexhash::entries, _cl_lexhash_entry::freq, _cl_lexhash_entry::id, _cl_lexhash_entry_data::integer, _cl_lexhash_entry::key, _cl_lexhash_entry::next, _cl_lexhash::next_id, _cl_lexhash_entry_data::numeric, _cl_lexhash_entry_data::pointer, and _cl_lexhash::table.

Referenced by addline(), close_range(), declare_range(), main(), open_range(), and write_region_to_disk().

void cl_lexhash_auto_grow ( cl_lexhash  hash,
int  flag 
)

Turns a cl_lexhash's ability to autogrow on or off.

When this setting is switched on, the lexhash will grow automatically to avoid performance degradation.

Note the default value for this setting is SWITCHED ON.

See also:
cl_lexhash_check_grow
Parameters:
hash The hash that will be affected.
flag New value for autogrow setting: boolean where true is on and false is off.

References _cl_lexhash::auto_grow.

int cl_lexhash_del ( cl_lexhash  hash,
char *  token 
)

Deletes a string from a hash.

The entry corresponding to the specified string is removed from the lexhash. If the string is not in the lexhash to begin with, no action is taken.

Parameters:
hash The hash to alter.
token The string to remove.
Returns:
The frequency of the deleted entry.

References cl_delete_lexhash_entry(), cl_lexhash_find_i(), _cl_lexhash::entries, _cl_lexhash_entry::freq, _cl_lexhash_entry::next, and _cl_lexhash::table.

cl_lexhash_entry cl_lexhash_find ( cl_lexhash  hash,
char *  token 
)

Finds the entry corresponding to a particular string within a cl_lexhash.

Parameters:
hash The hash to search.
token The key-string to look for.
Returns:
The entry that is found (or NULL if the string is not in the hash).

References cl_lexhash_find_i().

Referenced by close_range(), main(), open_range(), print_range_registry_line(), and write_region_to_disk().

int cl_lexhash_freq ( cl_lexhash  hash,
char *  token 
)

Gets the frequency of a particular string within a lexhash.

Parameters:
hash The hash to look in.
token The string to look for.
Returns:
The frequency of that string, or 0 if the string is not in the hash (which is, of course, actually its frequency).

References cl_lexhash_find_i(), and _cl_lexhash_entry::freq.

Referenced by main(), and open_range().

int cl_lexhash_id ( cl_lexhash  hash,
char *  token 
)

Gets the ID of a particular string within a lexhash.

Note this is the ID integer that identifies THAT PARTICULAR STRING, not the hash value of that string - which only identifies the bucket the string is found in!

Parameters:
hash The hash to look in.
token The string to look for.
Returns:
The ID code of that string, or -1 if the string is not in the hash.

References cl_lexhash_find_i(), and _cl_lexhash_entry::id.

Referenced by addline(), and declare_range().

void cl_lexhash_set_cleanup_function ( cl_lexhash  lh,
void(*)(cl_lexhash_entry)  func 
)
int cl_lexhash_size ( cl_lexhash  hash  ) 

Gets the number of different strings stored in a lexhash.

This returns the total number of entries in all the bucket linked-lists in the whole hashtable.

Parameters:
hash The hash to size up.

References _cl_lexhash::buckets, _cl_lexhash_entry::next, and _cl_lexhash::table.

char* cl_make_set ( char *  s,
int  split 
)

Generates a set attribute value.

Parameters:
s The input string.
split Boolean; if True, s is split on whitespace. If False, the function expects input in '|'-delimited format.
Returns:
The set attribute value in standard syntax ('|' delimited, sorted with cl_strcmp). If there is any syntax error, cl_make_set() returns NULL.

References CDA_EFSETINV, CDA_OK, cderrno, cl_delete_string_list(), cl_free, cl_malloc(), cl_new_string_list(), cl_strdup(), cl_string_list_append(), cl_string_list_get(), cl_string_list_qsort(), and cl_string_list_size().

Referenced by addline(), check_set(), and open_range().

void* cl_malloc ( size_t  bytes  ) 

safely allocates memory malloc-style.

This function allocates a block of memory of the requested size, and does a test for malloc() failure which aborts the program and prints an error message if the system is out of memory. So the return value of this function can be used without further testing for malloc() failure.

Parameters:
bytes Number of bytes to allocate
Returns:
Pointer to the block of allocated memory

Referenced by accessible(), add_corpus_property(), add_grant_to_last_user(), add_host_to_list(), add_hosts_in_subnet_to_list(), add_tabular_pattern(), add_to_string(), add_user_to_list(), AddNameToAL(), alloc_mblob(), Allocate(), attach_subcorpus(), binsert_g(), check_alignment_constraints(), cl_lexhash_add(), cl_make_set(), cl_new_int_list(), cl_new_lexhash(), cl_new_regex(), cl_new_string_list(), cl_string_latex2iso(), collect_matches(), collect_matching_ids(), combine_subcorpus_spec(), compute_code_lengths(), compute_grouping(), ComputeGroupExternally(), cqi_read_bool_list(), cqi_read_byte_list(), cqi_read_int_list(), cqi_read_string(), cqi_read_string_list(), cqp_run_mu_query(), cqp_run_tab_query(), creat_rev_corpus(), creat_rev_corpus_idx(), create_bitfield(), define_macro(), do_cqi_cqp_query(), do_flagged_re_variable(), do_MeetStatement(), do_mval_string(), do_undump(), do_UnionStatement(), do_XMLTag(), duplicate_corpus(), evaltree2searchstr(), find_corpus_registry(), get_leaf_value(), get_matched_corpus_positions(), get_positions(), GetVariableItems(), GetVariableStrings(), hash_add(), labellookup(), list_macros(), macro_iterator_next_prototype(), MacroAddSegment(), MacroHashAdd(), main(), make_attribute_hash(), make_first_tabular_pattern(), make_temp_corpus(), MakeMacroHash(), mallocfile(), matchfirstpattern(), meet_mu(), mval_string_conversion(), new_reftab(), new_symbol_table(), new_tabulation_item(), NewAttributeList(), NewContextDescriptor(), NewVariable(), open_input_stream(), OptimizeStringConstraint(), parse_macro_name(), PushInputBuffer(), quote_file_path(), RangeSetop(), RangeSort(), read_mapping(), ReadHCD(), scan_directory(), set_corpus_matchlists(), set_target(), Setop(), show_corpora_files1(), simulate_dfa(), SL_insert_after_point(), SortExternally(), SortSubcorpus(), SortSubcorpusRandomize(), strdupto(), try_optimization(), and VariableAddItem().

int cl_max_alg ( Attribute attribute  ) 

Gets the number of alignments on this align-attribute.

This is equal to the maximum alignment on this attribute.

Parameters:
attribute An align-attribute.
Returns:
The number of alignments on this attribute.

References CDA_ENODATA, CDA_OK, cderrno, cl_has_extended_alignment(), CompAlignData, CompXAlignData, ensure_component(), and TComponent::size.

Referenced by do_cqi_cl_attribute_size(), and show_statistics().

int cl_max_struc ( Attribute a  ) 

Gets the maximum for this S-attribute (ie the size of the S-attribute).

The result of this function is equal to the number of instances of this s-attribute in the corpus.

Parameters:
a The s-attribute to evaluate.

Returns:
The maximum corpus position, or an error code (if less than 0)

References cderrno, and get_nr_of_strucs().

Referenced by add_key(), compose_kwic_line(), do_cqi_cl_attribute_size(), main(), matchfirstpattern(), and show_statistics().

cl_int_list cl_new_int_list ( void   ) 
cl_lexhash cl_new_lexhash ( int  buckets  ) 

Creates a new cl_lexhash object.

Parameters:
buckets The number of buckets in the newly-created cl_lexhash; set to 0 to use the default number of buckets.
Returns:
The new cl_lexhash.

References _cl_lexhash::auto_grow, _cl_lexhash::buckets, cl_calloc(), cl_malloc(), _cl_lexhash::cleanup_func, _cl_lexhash::comparisons, DEFAULT_NR_OF_BUCKETS, _cl_lexhash::entries, find_prime(), _cl_lexhash::last_performance, _cl_lexhash::next_id, PERFORMANCE_COUNT, _cl_lexhash::performance_counter, and _cl_lexhash::table.

Referenced by cl_lexhash_check_grow(), declare_range(), declare_wattr(), main(), and write_region_to_disk().

CL_Regex cl_new_regex ( char *  regex,
int  flags,
CorpusCharset  charset 
)

Creates a new CL_Regex object (ie a regular expression buffer).

The regular expression is preprocessed according to the flags, and anchored to the start and end of the string. (That is, ^ is added to the start, $ to the end.)

Then the resulting regex is compiled (using POSIX compilation) and optimised. Currently the character set parameter is ignored and assumed to be Latin-1.

Parameters:
regex String containing the regular expression
flags IGNORE_CASE, or IGNORE_DIAC, or both, or 0.
charset The character set of the regex. Currently ignored.
Returns:
The new CL_Regex object, or NULL in case of error.

References _CL_Regex::anchor_end, _CL_Regex::anchor_start, _CL_Regex::buffer, CDA_EBADREGEX, CDA_OK, cderrno, _CL_Regex::charset, cl_debug, cl_free, cl_malloc(), cl_regex_error, cl_regopt_analyse(), cl_regopt_anchor_end, cl_regopt_anchor_start, cl_regopt_grain, cl_regopt_grain_len, cl_regopt_grains, cl_regopt_jumptable, cl_strdup(), cl_string_canonical(), cl_string_latex2iso(), _CL_Regex::flags, _CL_Regex::grain, _CL_Regex::grain_len, _CL_Regex::grains, IGNORE_CASE, IGNORE_DIAC, _CL_Regex::iso_string, _CL_Regex::jumptable, and MAX_LINE_LENGTH.

Referenced by add_key(), collect_matching_ids(), do_flagged_string(), and do_XMLTag().

cl_string_list cl_new_string_list ( void   ) 
CorpusProperty cl_next_corpus_property ( CorpusProperty  prop  ) 

Gets the next corpus property on the list of properties.

(The corpus properties iterator / property datatype is public.)

Parameters:
prop The current property.
Returns:
The next property on the list.

References TCorpusProperty::next.

Referenced by cl_corpus_property(), and corpus_info().

unsigned int cl_random ( void   ) 

Gets a random number.

Part of the CL-internal random number generator.

Returns:
The random number, an unsigned 32-bit integer with uniform distribution

References RNG_I1, and RNG_I2.

Referenced by cl_runif(), and SortSubcorpusRandomize().

void cl_randomize ( void   ) 

Initialises the CL-internal random number generator from the current system time.

References cl_set_seed().

Referenced by cqp_randomize(), initialize_cqp(), and main().

void* cl_realloc ( void *  block,
size_t  bytes 
)

safely reallocates memory.

See also:
cl_malloc
Parameters:
block Pointer to the block to be reallocated
bytes Number of bytes to allocate to the resized memory block
Returns:
Pointer to the block of reallocated memory

Referenced by add_to_string(), binsert_g(), cl_int_list_set(), cl_string_list_set(), ComputeGroupExternally(), ComputeGroupInternally(), get_positions(), load_macro_file(), meet_mu(), NewVariable(), RangeSetop(), read_mapping(), Reallocate(), Setop(), and VariableAddItem().

int cl_regex_match ( CL_Regex  rx,
char *  str 
)

Matches a regular expression against a string.

The regular expression contained in the CL_Regex is compared to the string. No settings or flags are passed to this function; rather, the settings that rx was created with are used.

Parameters:
rx The regular expression to match.
str The string to compare the regex to.
Returns:
Boolean: true if the regex matched, otherwise false.

References _CL_Regex::anchor_end, _CL_Regex::anchor_start, _CL_Regex::buffer, cl_string_canonical(), _CL_Regex::flags, _CL_Regex::grain, _CL_Regex::grain_len, _CL_Regex::grains, _CL_Regex::iso_string, and _CL_Regex::jumptable.

Referenced by eval_bool(), eval_constraint(), main(), and matchfirstpattern().

int cl_regex_optimised ( CL_Regex  rx  ) 

Finds the level of optimisation of a CL_Regex.

This function returns the approximate level of optimisation, computed from the ratio of grain length to number of grains (0 = no grains, ergo not optimised at all).

Parameters:
rx The CL_Regex to check.
Returns:
0 if rx is not optimised; otherwise an integer indicating optimisation level.

References _CL_Regex::grain_len, and _CL_Regex::grains.

Referenced by collect_matching_ids().

double cl_runif ( void   ) 

Gets a random number in the range [0,1] with uniform distribution.

Part of the CL-internal random number generator.

Returns:
The generated random number.

References cl_random().

Referenced by do_cqi_cqp_query(), and do_reduce().

void cl_set_debug_level ( int  level  ) 

Sets the debug level configuration variable.

See also:
cl_debug

References cl_debug.

Referenced by execute_side_effects(), main(), parse_options(), and set_default_option_values().

int cl_set_intersection ( char *  result,
const char *  s1,
const char *  s2 
)

Computes the intersection of two set attribute values.

Compute intersection of two set attribute values (in standard syntax, i.e. sorted and '|'-delimited); memory for the result string must be allocated by the caller.

Returns:
0 on error, 1 otherwise

References CDA_EBUFFER, CDA_EFSETINV, CDA_OK, cderrno, CL_DYN_STRING_SIZE, and cl_strcmp().

Referenced by call_predefined_function().

void cl_set_memory_limit ( int  limit  ) 

Sets the memory limit.

NOTE name of parameter differs here and in cl.h -- TODO

See also:
cl_memory_limit

References cl_memory_limit.

Referenced by main().

void cl_set_optimize ( int  state  ) 

Turns optimization on or off.

See also:
cl_optimize
Parameters:
state Boolean (true turns it on, false turns it off).

References cl_optimize.

Referenced by execute_side_effects(), main(), and set_default_option_values().

void cl_set_rng_state ( unsigned int  i1,
unsigned int  i2 
)

Restores the state of the CL-internal random number generator.

Parameters:
i1 The value to set the first RNG integer to (if zero, resets it to 1)
i2 The value to set the second RNG integer to (if zero, resets it to 1)

References RNG_I1, and RNG_I2.

Referenced by cl_set_seed(), and SortSubcorpusRandomize().

void cl_set_seed ( unsigned int  seed  ) 

Initialises the CL-internal random number generator.

Parameters:
seed A single 32bit number to use as the seed

References cl_set_rng_state().

Referenced by cl_randomize().

int cl_set_size ( char *  s  ) 

Counts the number of elements in a set attribute value.

This function counts the number of elements in a set attribute value (using '|'-delimited standard syntax).

Returns:
-1 on error (in particular, if set is malformed)

References CDA_EFSETINV, CDA_OK, and cderrno.

Referenced by call_predefined_function().

int cl_strcmp ( char *  s1,
char *  s2 
)

CL internal string comparison (uses signed char on all platforms).

Referenced by cl_set_intersection(), cl_string_list_strcmp(), get_id_of_string(), and scompare().

char* cl_strdup ( char *  string  ) 
void cl_string_canonical ( char *  s,
int  flags 
)

Converts a string to canonical form.

The "canonical form" of a string is for use in comparisons where case-insensitivity and/or diacritic insensitivity is desired.

Note that the string s is modified in place.

Parameters:
s The string (must be Latin-1!)
flags The flags that specify which conversions are required. Can be IGNORE_CASE and/or IGNORE_DIAC.

References cl_string_maptable(), and latin1.

Referenced by cl_new_regex(), cl_regex_match(), collect_matching_ids(), main(), print_tabulation(), setup_corpus(), and SortSubcorpus().

char* cl_string_latex2iso ( char *  str,
char *  result,
int  target_len 
)

Converts strings with latex-style backslash escapes for accented characters to ISO-8859-1 (Latin-1).

Syntax:

\[AaOoUus..] --> corresponding ISO 8859-1 character

\{octal} --> ISO 8859-1 character

Parameters:
str The string to convert.
result The location to put the altered string (which should be shorter than, or at least no longer than, the input string). If this parameter is NULL, space is automatically allocated for the output. result is allowed to be the same as str.
target_len The maximum length of the target string. If result is NULL, then this is set automatically.
Returns:
Pointer to the altered string (if result was NULL you need to catch this and free it when no longer needed).

See also:
cl_string_latex2iso
cl_string_latex2iso

References cl_malloc(), popc, and pushc.

Referenced by cl_new_regex(), do_flagged_string(), do_SetVariableValue(), and do_XMLTag().

void cl_string_list_append ( cl_string_list  l,
char *  val 
)

Appends a string pointer to the end of a cl_string_list object.

References cl_string_list_set(), and _cl_string_list::size.

Referenced by cl_make_set(), cwbci_check_line(), declare_range(), parse_options(), and scan_directory().

char* cl_string_list_get ( cl_string_list  l,
int  n 
)

Retrieves an element from a cl_string_list object.

Parameters:
l The list to search.
n The element to retrieve.
Returns:
The n'th string on the list, or NULL if there is no n'th string.

References _cl_string_list::data, and _cl_string_list::size.

Referenced by cl_make_set(), close_range(), cwbci_check_line(), get_input_line(), main(), open_range(), parse_options(), and print_range_registry_line().

void cl_string_list_lumpsize ( cl_string_list  l,
int  s 
)

Sets the lumpsize of a cl_string_list object.

See also:
LUMPSIZE
Parameters:
l The cl_string_list.
s The new lumpsize.

References _cl_string_list::lumpsize, and LUMPSIZE.

void cl_string_list_qsort ( cl_string_list  l  ) 

Sorts a cl_string_list object.

The list of strings is sorted using cl_strcmp().

See also:
cl_strcmp

References cl_string_list_strcmp(), _cl_string_list::data, and _cl_string_list::size.

Referenced by cl_make_set(), and scan_directory().

void cl_string_list_set ( cl_string_list  l,
int  n,
char *  val 
)

Sets a string pointer on a cl_string_list object.

The n'th element on the list is set to val, and the list is auto-extended if necessary.

References _cl_string_list::allocated, cl_realloc(), _cl_string_list::data, _cl_string_list::lumpsize, and _cl_string_list::size.

Referenced by cl_string_list_append().

int cl_string_list_size ( cl_string_list  l  ) 

Gets the current size of a cl_string_list object (number of elements on the list).

References _cl_string_list::size.

Referenced by cl_make_set(), close_range(), cwbci_check_line(), main(), open_range(), parse_options(), and print_range_registry_line().

unsigned char* cl_string_maptable ( CorpusCharset  charset,
int  flags 
)

Gets a specified character mapping table for use in regular expressions.

Parameters:
charset The character set of this corpus. Currently ignored.
flags The flags that specify which table is required. Can be IGNORE_CASE and/or IGNORE_DIAC.
Returns:
Pointer to the appropriate mapping table.

References IGNORE_CASE, IGNORE_DIAC, latin1_identity_tab, latin1_identity_tab_init, latin1_nocase_nodiac_tab, latin1_nocase_nodiac_tab_init, latin1_nocase_tab, and latin1_nodiac_tab.

Referenced by cl_string_canonical(), and SortSubcorpus().

int ClosePositionStream ( PositionStream ps  ) 

Deletes a PositionStream object.

References BSclose().

Referenced by compress_reversed_index(), and decompress_check_reversed_index().

int* collect_matches ( Attribute attribute,
int *  word_ids,
int  number_of_words,
int  sort,
int *  size_of_table,
int *  restrictor_list,
int  restrictor_list_size 
)

Gets a list of corpus positions matching a list of ids.

This function returns an (ordered) list of all corpus positions which match one of the ids given in the list of ids. The table is allocated with malloc, so free it when you don't need it any more.

The list itself is returned; its size is placed in size_of_table. This size is, of course, the same as the cumulative id frequency of the ids (because each corpus position matching one of the ids is added into the list).

BEWARE: when the id list is rather big or there are highly-frequent ids in the id list (for example, after a call to collect_matching_ids with the pattern ".*") this will give a copy of the corpus -- for which you probably don't have enough memory!!! It is therefore a good idea to call cumulative_id_frequency before and to introduce some kind of bias.

A note on the last two parameters, which are currently unused: restrictor_list is a list of integer pairs [a,b] which means that the returned value only contains positions which fall within at least one of these intervals. The list must be sorted by the start positions, and secondarily by b. restrictor_list_size is the number of integers in this list, NOT THE NUMBER OF PAIRS. WARNING: CURRENTLY UNIMPLEMENTED {NB -- this description of restrictor_list_size DOESN'T MATCH the one for get_positions(), which this function calls...}

REMEMBER: this monster returns a list of corpus indices, not a list of ids.

See also:
collect_matching_ids
get_positions
Parameters:
attribute The P-attribute we are looking in
word_ids A list of item ids (i.e. id codes for items on this attribute).
number_of_words The length of this list.
sort boolean: return sorted list?
size_of_table The size of the allocated table will be placed here.
restrictor_list See function description.
restrictor_list_size See function description.
Returns:
Pointer to the list of corpus positions.

References ATT_POS, CDA_EIDORNG, CDA_ENODATA, CDA_OK, cderrno, check_arg, cl_malloc(), CompLexiconIdx, cumulative_id_frequency(), ensure_component(), get_positions(), intcompare(), and TComponent::size.

Referenced by calculate_initial_matchlist_1(), get_corpus_positions(), and get_matched_corpus_positions().

int* collect_matching_ids ( Attribute attribute,
char *  pattern,
int  flags,
int *  number_of_matches 
)

Gets a list of the ids of those items on a given Attribute that match a particular regular-expression pattern.

The pattern is interpreted with the CL regex engine, q.v.

The function returns a pointer to a sequence of ints of size number_of_matches. The list is allocated with malloc(), so do a cl_free() when you don't need it any more.

See also:
cl_new_regex
Parameters:
attribute The p-attribute to look on.
pattern String containing the pattern against which to match each item on the attribute
flags Flags for the regular expression system via cl_new_regex.
number_of_matches This is set to the number of item ids found, i.e. the size of the returned buffer.
Returns:
A pointer to the list of item ids.

References _CL_Regex::anchor_end, _CL_Regex::anchor_start, ATT_POS, _CL_Regex::buffer, CDA_EBADREGEX, CDA_ENODATA, CDA_OK, cderrno, check_arg, cl_calloc(), cl_debug, cl_delete_regex(), cl_free, cl_malloc(), cl_new_regex(), cl_regex_error, cl_regex_optimised(), cl_string_canonical(), CompLexicon, CompLexiconIdx, TMblob::data, TComponent::data, ensure_component(), _CL_Regex::flags, _CL_Regex::grain, _CL_Regex::grain_len, _CL_Regex::grains, IGNORE_CASE, IGNORE_DIAC, _CL_Regex::iso_string, _CL_Regex::jumptable, latin1, TComponent::size, and word.

Referenced by get_matched_corpus_positions(), and OptimizeStringConstraint().

int cumulative_id_frequency ( Attribute attribute,
int *  word_ids,
int  number_of_words 
)

Calculates the total frequency of all items on a list of item IDs.

This function returns the sum of the frequencies of the items in word_ids, which is an array of item IDs with length number_of_words.

The result is therefore the number of corpus positions which match one of the words.

Parameters:
attribute P-attribute on which these items are found.
word_ids An array of item IDs.
number_of_words Length of the word_ids array.
Returns:
Sum of all the frequencies; less than 0 for an error.

References ATT_POS, CDA_ENODATA, CDA_OK, cderrno, check_arg, and get_id_frequency().

Referenced by collect_matches().

void describe_corpus ( Corpus corpus  ) 
int drop_corpus ( Corpus corpus  ) 
Attribute* find_attribute ( Corpus corpus,
char *  attribute_name,
int  type,
char *  data 
)

Finds an attribute that matches the specified parameters, if one exists.

Parameters:
corpus The corpus in which to search for the attribute.
attribute_name The name of the attribute (i.e. the handle it has in the registry file)
type Type of attribute to be searched for.
data NOT USED.
Returns:
Pointer to Attribute object, or NULL if not found.

References _Attribute::any, TCorpus::attributes, STREQ, and _Attribute::type.

Referenced by compute_grouping(), ComputePrintStructures(), do_attribute_show(), do_Description(), do_IDReference(), do_LabelReference(), do_SimpleVariableReference(), do_StringConstraint(), do_StructuralContext(), drop_attribute(), evaluate_target(), findcorpus(), FunctionCall(), get_matched_corpus_positions(), main(), prepare_AlignmentConstraints(), printAlignedStrings(), read_mapping(), RecomputeAL(), red_factor(), Setop(), setup_attribute(), SortSubcorpus(), SystemCorpusSize(), update_context_descriptor(), verify_context_descriptor(), and VerifyList().

Corpus* find_corpus ( char *  registry_dir,
char *  registry_name 
)

Gets a pointer to the Corpus object with the specified CWB-name and registry location.

(Works by searching the loaded_corpora global linked list.)

Parameters:
registry_dir The registry directory.
registry_name The CWB name of the corpus.
Returns:
The Corpus, or NULL if it wasn't found.

References cl_standard_registry, TCorpus::next, TCorpus::registry_dir, TCorpus::registry_name, and STREQ.

Referenced by setup_corpus().

int get_alg_attribute ( Attribute attribute,
int  position,
int *  source_corpus_start,
int *  source_corpus_end,
int *  aligned_corpus_start,
int *  aligned_corpus_end 
)

Gets the corpus positions of an alignment on the given align-attribute.

This is for old-style alignments only: it doesn't (can't) deal with extended alignments. Deprecated: use cl_alg2cpos instead (but note its parameters are not identical).

See also:
cl_alg2cpos.
Parameters:
attribute The align-attribute to look on.
position The corpus position {??} of the alignment whose positions are wanted.
source_corpus_start Location to put source corpus start position.
source_corpus_end Location to put source corpus end position.
aligned_corpus_start Location to put target corpus start position.
aligned_corpus_end Location to put target corpus end position.
Returns:
Boolean: true = all OK, false = problem.

References ATT_ALIGN, CDA_ENODATA, CDA_EPOSORNG, CDA_OK, cderrno, check_arg, CompAlignData, TMblob::data, TComponent::data, ensure_component(), get_alignment(), and TComponent::size.

int get_attribute_size ( Attribute attribute  ) 

Gets the maximum position on this P-attribute (ie the size of the attribute).

The result of this function is equal to the number of tokens in the attribute.

If the attribute's item sequence is compressed, this is read from the attribute's Huffman code descriptor block.

Otherwise, it is read from the size member of the Attribute's CompCorpus component.

Returns:
The maximum corpus position, or an error code (if less than 0)

References ATT_POS, CDA_ENODATA, CDA_OK, cderrno, check_arg, CompCorpus, CompHuffCodes, corpus, ensure_component(), POS_Attribute::hc, item_sequence_is_compressed(), _huffman_code_descriptor::length, _Attribute::pos, and TComponent::size.

Referenced by compose_kwic_line(), get_matched_corpus_positions(), get_positions(), OpenPositionStream(), Setop(), and SystemCorpusSize().

int get_bounds_of_nth_struc ( Attribute attribute,
int  struc_num,
int *  struc_start,
int *  struc_end 
)

Retrieves the start-and-end corpus positions of a specified structure of the given s-attribute type.

Parameters:
attribute An s-attribute.
struc_num The instance of that s-attribute to retrieve (i.e. the struc_num'th instance of this s-attribute in the corpus).
struc_start Location to put the starting corpus position.
struc_end Location to put the ending corpus position.
Returns:
Boolean: true for all OK, false for problem.

References ATT_STRUC, CDA_EIDXORNG, CDA_ENODATA, CDA_OK, cderrno, check_arg, CompStrucData, TMblob::data, TComponent::data, ensure_component(), and TComponent::size.

Referenced by calculate_ranges(), and feature_match().

int get_id_at_position ( Attribute attribute,
int  position 
)
int get_id_frequency ( Attribute attribute,
int  id 
)

Gets the frequency of an item on this attribute.

Parameters:
attribute The P-attribute to look on
id Identifier of an item on this attribute.
Returns:
The frequency count of the item specified by id, or an error code (if less than 0)

References ATT_POS, CDA_EIDXORNG, CDA_ENODATA, CDA_OK, cderrno, check_arg, CompCorpusFreqs, TMblob::data, TComponent::data, and ensure_component().

Referenced by call_predefined_function(), compute_code_lengths(), cumulative_id_frequency(), get_id_info(), get_positions(), and OpenPositionStream().

int get_id_from_sortidx ( Attribute attribute,
int  sort_index_position 
)

Gets the ID code of the item at the specified position in the Attribute's sorted wordlist index.

See also:
get_sortidxpos_of_id
Parameters:
attribute The (positional) Attribute whose index is to be searched.
sort_index_position The offset in the index where the ID code is to be found.
Returns:
Either the integer ID, or an error code (if less than 0)

References ATT_POS, CDA_EIDXORNG, CDA_ENODATA, CDA_OK, cderrno, check_arg, CompLexiconSrt, TMblob::data, TComponent::data, and ensure_component().

Referenced by do_show().

char* get_id_info ( Attribute attribute,
int  index,
int *  freq,
int *  slen 
)

Gets the string of the item with the specified ID on the given p-attribute.

As well as returning the string, other information about the item is inserted into locations specified by other parameters.

Parameters:
attribute The P-attribute to look on.
index The ID of the item to look at.
freq Will be set to the frequency of the item.
slen Will be set to the string-length of the item.
Returns:
The string of the item at that position on this attribute, OR NULL if there is an error.

References ATT_POS, CDA_OK, cderrno, check_arg, get_id_frequency(), get_id_string_len(), and get_string_of_id().

Referenced by print_info().

int get_id_of_string ( Attribute attribute,
char *  id_string 
)

Gets the ID code that corresponds to the specified string on the given P-attribute.

Parameters:
attribute The (positional) Attribute to look the string up on
id_string The string of an item on this attribute
Returns:
Either the integer ID of the item, or an error code (if less than 0)

References ATT_POS, CDA_ENODATA, CDA_ENOSTRING, CDA_EOTHER, CDA_OK, cderrno, check_arg, cl_strcmp(), CompLexicon, CompLexiconIdx, CompLexiconSrt, TMblob::data, TComponent::data, ensure_component(), and TComponent::size.

Referenced by call_predefined_function(), get_corpus_positions(), map_token_to_class_number(), member_of_class_s(), OptimizeStringConstraint(), read_mapping(), show_features(), and VerifyVariable().

int get_id_range ( Attribute attribute  ) 

Gets the maximum id on this P-attribute (ie the range of the attribute's ID codes).

The result of this function is equal to the number of types in this attribute.

See also:
get_attribute_size
Returns:
The maximum Id, or an error code (if less than 0)

References ATT_POS, CDA_ENODATA, CDA_OK, cderrno, check_arg, CompLexiconIdx, ensure_component(), and TComponent::size.

Referenced by get_matched_corpus_positions(), get_positions(), OpenPositionStream(), and OptimizeStringConstraint().

int get_id_string_len ( Attribute attribute,
int  id 
)

Calculates the length of the string that corresponds to the specified item on the given P-attribute.

Parameters:
attribute The (positional) Attribute to look up the item on
id Identifier of an item on this attribute.
Returns:
The length of the string, or a CDA_ error code

References ATT_POS, CDA_EIDORNG, CDA_ENODATA, CDA_EOTHER, CDA_OK, cderrno, check_arg, CompLexiconIdx, TMblob::data, TComponent::data, ensure_component(), get_string_of_id(), and TComponent::size.

Referenced by get_id_info().

int get_nr_of_strucs ( Attribute attribute,
int *  nr_strucs 
)

Gets the number of instances of an s-attribute in the corpus.

Deprecated: use cl_max_struc instead.

See also:
cl_max_struc.
Parameters:
attribute The s-attribute to count.
nr_strucs The number of instances is put here.
Returns:
boolean: true for all OK, false for problem.

References ATT_STRUC, CDA_ENODATA, CDA_OK, cderrno, check_arg, CompStrucData, ensure_component(), and TComponent::size.

Referenced by calculate_ranges(), and cl_max_struc().

int get_num_of_struc ( Attribute attribute,
int  position,
int *  struc_num 
)

Gets the ID number of a structure (instance of an s-attribute) that is found at the given corpus position.

Deprecated function: use cl_cpos2struc.

See also:
cl_cpos2struc
Parameters:
attribute The s-attribute on which to search.
position The corpus position to look for.
struc_num Location where the number of the structure that is found will be put.
Returns:
Boolean: true for all OK, false for error.

References ATT_STRUC, CDA_ENODATA, CDA_ESTRUC, CDA_OK, cderrno, check_arg, CompStrucData, TMblob::data, TComponent::data, ensure_component(), get_previous_mark(), and TComponent::size.

Referenced by calculate_ranges(), cl_cpos2struc(), and structure_value_at_position().

int* get_positions ( Attribute attribute,
int  id,
int *  freq,
int *  restrictor_list,
int  restrictor_list_size 
)

Gets all the corpus positions where the specified item is found on the given P-attribute.

The restrictor list is a set of ranges in which instances of the item MUST occur to be collected by this function. If no restrictor list is specified (i.e. restrictor_list is NULL), then ALL corpus positions where the item occurs are returned.

This restrictor list has the form of a list of ranges {start,end} of size restrictor_list_size, that is, the number of ints in this area is 2 * restrictor_list_size!!!

Parameters:
attribute The P-attribute to look on.
id The id of the item to look for.
freq The frequency of the specified item is written here. This will be 0 in the case of errors.
restrictor_list A list of pairs of integers specifying ranges {start,end} in the corpus
restrictor_list_size The number of PAIRS of ints in the restrictor list.
Returns:
Pointer to the list of corpus positions; or NULL in case of error.

References ATT_POS, BSclose(), BSopen(), BSseek(), CDA_EIDORNG, CDA_ENODATA, CDA_OK, cderrno, check_arg, cl_free, cl_malloc(), cl_realloc(), CompCompRF, CompCompRFX, CompRevCorpus, CompRevCorpusIdx, compute_ba(), TMblob::data, TComponent::data, ensure_component(), get_attribute_size(), get_id_frequency(), get_id_range(), inverted_file_is_compressed(), range, and read_golomb_code_bs().

Referenced by calculate_initial_matchlist_1(), and collect_matches().

int get_sortidxpos_of_id ( Attribute attribute,
int  id 
)

Gets the position in the Attribute's sorted wordlist index of the item with the specified ID code.

This function is NOT YET IMPLEMENTED.

See also:
get_id_from_sortidx
Parameters:
attribute The (positional) Attribute whose index is to be searched
id Identifier of an item on this attribute.
Returns:
The offset of that item in the sorted wordlist index.

References ATT_POS, CDA_ENODATA, CDA_ENYI, CDA_OK, cderrno, check_arg, CompLexiconSrt, and ensure_component().

char* get_string_at_position ( Attribute attribute,
int  position 
)

Gets the string of the item at the specified position on the given p-attribute.

Parameters:
attribute The P-attribute to look on.
position The corpus position to look at.
Returns:
The string of the item at that position on this attribute, OR NULL if there is an error.

References ATT_POS, CDA_OK, cderrno, check_arg, get_id_at_position(), and get_string_of_id().

Referenced by get_leaf_value(), get_position_values(), and print_next_region().

char* get_string_of_id ( Attribute attribute,
int  id 
)

Gets the string that corresponds to the specified item on the given P-attribute.

Parameters:
attribute The Attribute to look the item up on
id Identifier of an item on this attribute.
Returns:
The string, or NULL if there is an error

References ATT_POS, CDA_EIDORNG, CDA_ENODATA, CDA_OK, cderrno, check_arg, CompLexicon, CompLexiconIdx, TMblob::data, TComponent::data, ensure_component(), and TComponent::size.

Referenced by call_predefined_function(), compute_code_lengths(), eval_bool(), get_id_info(), get_id_string_len(), get_string_at_position(), and print_mapping().

int get_struc_attribute ( Attribute attribute,
int  position,
int *  struc_start,
int *  struc_end 
)

Gets the start and end positions of the instance of the given S-attribute found at the specified corpus position.

This function finds one particular instance of the S-attribute, and assigns its start and end points to the locations given as arguments.

Parameters:
attribute The s-attribute to search.
position The corpus position to search for.
struc_start Location for the start position of the instance.
struc_end Location for the end position of the instance.

References ATT_STRUC, CDA_ENODATA, CDA_ESTRUC, CDA_OK, cderrno, check_arg, CompStrucData, TMblob::data, TComponent::data, ensure_component(), get_previous_mark(), and TComponent::size.

Referenced by calculate_ranges(), eval_bool(), get_leaf_value(), meet_mu(), show_position_values(), and simulate().

int inverted_file_is_compressed ( Attribute attribute  ) 

Check whether the index (inverted file) of the given P-attribute is compressed.

See comments in body of function for what counts as "compressed".

Returns:
Boolean.

References ATT_POS, cderrno, check_arg, CompCompRF, CompCompRFX, component_state(), ComponentLoaded, ComponentUnloaded, CompRevCorpus, and CompRevCorpusIdx.

Referenced by get_positions(), and OpenPositionStream().

int item_sequence_is_compressed ( Attribute attribute  ) 

Checks whether the item sequence of the given P-attribute is compressed.

See comments in body of function for what counts as "compressed".

Returns:
Boolean.

References ATT_POS, cderrno, check_arg, CompCorpus, CompHuffCodes, CompHuffSeq, CompHuffSync, component_state(), ComponentLoaded, ComponentUnloaded, POS_Attribute::hc, and _Attribute::pos.

Referenced by get_attribute_size(), get_id_at_position(), and load_component().

int nr_of_arguments ( Attribute attribute  ) 

Count the number of arguments on an attribute's dynamic argument list.

Parameters:
attribute pointer to the Attribute object to analyse; it must be a dynamic attribute.
Returns:
integer specifying the number of arguments; a negative integer is returned if for any argument on dyn.arglist, the type is equal to ATTAT_VAR

References Dynamic_Attribute::arglist, ATT_DYN, ATTAT_VAR, CDA_OK, cderrno, check_arg, _Attribute::dyn, _DynArg::next, and _DynArg::type.

PositionStream OpenPositionStream ( Attribute attribute,
int  id 
)
int ReadPositionStream ( PositionStream  ps,
int *  buffer,
int  buffer_size 
)

Reads corpus positions from a position stream to a buffer.

Parameters:
ps The position stream to read.
buffer Location to put the resulting item positions.
buffer_size Maximum number of item positions to read. (Fewer will be read if fewer are available).
Returns:
The number of item positions that have been read. This may be less than buffer_size (and will be 0 if there are no instances of this item left).

References _position_stream_rec_::b, _position_stream_rec_::base, _position_stream_rec_::bs, _position_stream_rec_::id_freq, _position_stream_rec_::is_compressed, _position_stream_rec_::last_pos, _position_stream_rec_::nr_items, and read_golomb_code_bs().

Referenced by compress_reversed_index(), and decompress_check_reversed_index().

Corpus* setup_corpus ( char *  registry_dir,
char *  registry_name 
)

Creates a Corpus object.

Parameters:
registry_dir Path to the CWB registry directory from which the corpus is to be loaded. This may be NULL, in which case the default registry directory is used.
registry_name The CWB-name of the indexed corpus to load (in the all-lowercase form)
Returns:
Pointer to the resulting Corpus object.

References central_corpus_directory(), check_access_conditions(), cl_free, cl_strdup(), cl_string_canonical(), corpus, cregin, cregin_name, cregin_path, cregparse(), cregrestart(), drop_corpus(), find_corpus(), find_corpus_registry(), TCorpus::id, IGNORE_CASE, TCorpus::next, TCorpus::nr_of_loads, TCorpus::registry_dir, and TCorpus::registry_name.

Referenced by GetSystemCorpus(), main(), and printAlignedStrings().

int structure_has_values ( Attribute attribute  ) 
char* structure_value ( Attribute attribute,
int  struc_num 
)

Gets the value that is associated with the specified instance of the given s-attribute.

Parameters:
attribute An S-attribute.
struc_num ID of the structure whose value is wanted (ie, function gets value of struc_num'th instance of this s-attribute)
Returns:
A string; or NULL in case of error. Note that this string is a pointer to the depths of the Attribute object itself, as this function does not strdup() its result -- so don't free this return value!

References ATT_STRUC, CDA_EIDXORNG, CDA_EINTERNAL, CDA_ENODATA, CDA_OK, cderrno, check_arg, CompStrucAVS, CompStrucAVX, TMblob::data, TComponent::data, ensure_component(), s_v_comp(), TComponent::size, and structure_has_values().

Referenced by structure_value_at_position().

char* structure_value_at_position ( Attribute struc,
int  position 
)

Gets the value associated with the instance of the given s-attribute that occurs at the specified corpus position.

Parameters:
struc The s-attribute to search through.
position The corpus position being queried.
Returns:
The value of the instance of the s-attribute, or NULL for error.

References get_num_of_struc(), and structure_value().

Referenced by get_leaf_value(), and get_print_attribute_values().


Variable Documentation

int cderrno

The error message from (POSIX) regex compilation is placed in this buffer if cl_new_regex() fails.

Referenced by cl_new_regex(), and collect_matching_ids().


Generated on Sun Feb 28 18:08:04 2010 for CWB by  doxygen 1.6.1