From e67858577ea363021949017e55f3d8c95bf9159a Mon Sep 17 00:00:00 2001 From: Mayeul Kauffmann Date: Tue, 17 Oct 2023 13:50:28 +0200 Subject: [PATCH 1/4] Update annotate.r typos --- R/annotate.r | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/R/annotate.r b/R/annotate.r index 9f37414..cd1908d 100644 --- a/R/annotate.r +++ b/R/annotate.r @@ -5,8 +5,8 @@ #' The first column contains the ids for each hit. The second column contains the annotation label. The third column contains the fill level (which you probably won't use, but is important for some functionalities). #' Only nodes that are given a name in the tquery (using the 'label' parameter) will be added as annotation. #' -#' Note that while queries only find 1 node for each labeld component of a pattern (e.g., quote queries have 1 node for "source" and 1 node for "quote"), -#' all children of these nodes can be annotated by settting fill to TRUE. If a child has multiple ancestors, only the most direct ancestors are used (see documentation for the fill argument). +#' Note that while queries only find 1 node for each labeled component of a pattern (e.g., quote queries have 1 node for "source" and 1 node for "quote"), +#' all children of these nodes can be annotated by setting fill to TRUE. If a child has multiple ancestors, only the most direct ancestors are used (see documentation for the fill argument). #' #' @param tokens A tokenIndex data.table, or any data.frame coercible with \link{as_tokenindex}. #' @param column The name of the column in which the annotations are added. The unique ids are added as column_id @@ -86,7 +86,7 @@ annotate_tqueries <- function(tokens, column, ..., block=NULL, fill=TRUE, overwr #' Note that you can also directly use \link{annotate}. #' #' @param tokens A tokenIndex data.table, or any data.frame coercible with \link{as_tokenindex}. -#' @param nodes An rsyntaxNodes A data.table, as created with \link{apply_queries}. Can be a list of multiple data.tables. 
+#' @param nodes An rsyntaxNodes data.table, as created with \link{apply_queries}. Can be a list of multiple data.tables. #' @param column The name of the column in which the annotations are added. The unique ids are added as [column]_id, and the fill values are added as [column]_fill. #' #' @export @@ -147,7 +147,7 @@ annotate_nodes <- function(tokens, nodes, column) { #' #' @param tokens A tokenIndex data.table, or any data.frame coercible with \link{as_tokenindex}. #' @param nodes A data.table, as created with \link{apply_queries}. Can be a list of multiple data.tables. -#' @param use Optionally, specify which columns from nodes to add. Other than convenient, this is slighly different +#' @param use Optionally, specify which columns from nodes to add. Other than convenient, this is slightly different #' from subsetting the columns in 'nodes' beforehand if fill is TRUE. When the children are collected, #' the ids from the not-used columns are still blocked (see 'block') #' @param token_cols A character vector, specifying which columns from tokens to include in the output From 7d352856d68adb1418350da94e87863a50bc2d3d Mon Sep 17 00:00:00 2001 From: Mayeul Kauffmann Date: Tue, 17 Oct 2023 13:56:46 +0200 Subject: [PATCH 2/4] Update applied_reshape.r Typos --- R/applied_reshape.r | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/R/applied_reshape.r b/R/applied_reshape.r index 81b6725..143cec0 100644 --- a/R/applied_reshape.r +++ b/R/applied_reshape.r @@ -2,7 +2,7 @@ #' Split conjunctions for dependency trees in Universal Dependencies #' -#' @param tokens a tokenIndex based on texts parsed with \code{\link[spacyr]{spacy_parse}} (with dependency=TRUE) +#' @param tokens A tokenIndex based on texts parsed with \code{\link[spacyr]{spacy_parse}} (with dependency=TRUE) #' @param conj_rel The dependency relation for conjunctions. By default conj #' @param cc_rel The dependency relation for the coordinating conjunction. By default cc. 
This will be removed. #' @param unpack If TRUE (default), create separate branches for the parent and the node that inherits the parent position @@ -11,10 +11,10 @@ #' @param max_dist Optionally, a maximum distance between the conj node and its parent #' @param right_fill_dist Should fill to the right of the conjunction be used? #' @param compound_rel The relation types indicating compounds -#' @param ... specify conditions for the conjunction token. For instance, using 'pos = "VERB"' to only split VERB conjunctions. +#' @param ... Specify conditions for the conjunction token. For instance, using 'pos = "VERB"' to only split VERB conjunctions. #' This is especially usefull to use different no_fill conditions. #' -#' @return A tokenindex +#' @return A tokenIndex #' @export #' #' @examples @@ -39,7 +39,7 @@ split_UD_conj <- function(tokens, conj_rel='conj', cc_rel=c('cc','cc:preconj'), #' Have a node adopt its parent's position #' -#' given a tquery that identfies a node labeled "origin", that has a parent labeled "target", +#' Given a tquery that identifies a node labeled "origin", that has a parent labeled "target", #' recursively have child adopt the parent's position (parent and relation column) #' and adopt parents fill nodes. only_new restricts adding fill nodes to relations that child #' does not already have. 
This seems to be a good heuristic for dealing with argument drop @@ -50,7 +50,7 @@ split_UD_conj <- function(tokens, conj_rel='conj', cc_rel=c('cc','cc:preconj'), #' @param isolate If unpack is TRUE and isolate is TRUE (default is FALSE), isolate the new branch by recursively unpacking #' @param take_fill If TRUE (default), give the node that will inherit the parent position a copy of the parent children (but only if it does not already have children with this relation; see only_new) #' @param give_fill If TRUE (default), copy the children of the node that will inherit the parent position to the parent (but only if it does not already have children with this relation; see only_new) -#' @param only_new A characetr vector giving one or multiple column names that need to be unique for take_fill and give_fill +#' @param only_new A character vector giving one or multiple column names that need to be unique for take_fill and give_fill #' @param max_iter The climb tree function repeatedly resolves the first conjunction it encounters in a sentence. This can lead to many iterations #' for sentences with many (nested) conjunctions. It could be the case that in unforseen cases or with certain parsers #' an infinite loop is reached, which is why we use a max_iter argument that breaks the loop and sends a warning if the max is reached. 
@@ -114,7 +114,7 @@ climb_tree <- function(.tokens, tq, unpack=TRUE, isolate=TRUE, take_fill=TRUE, g .tokens = select_nodes(.tokens, tq2, fill_only_first = FALSE) ## copy the parent .tokens = copy_nodes(.tokens, 'parent', 'new_parent', copy_fill = FALSE) - ## point the duplicate children towards new copy + ## point the duplicate children towards new copy .tokens = mutate_nodes(.tokens, 'child', parent=new_parent$token_id) ## and add the parent fill for which relation is not already in copy .tokens = copy_fill(.tokens, 'parent', 'new_parent', only_new = 'relation') @@ -200,9 +200,9 @@ one_per_sentence <- function(.tokens) { } -#' Chop of a branch of the tree +#' Chop off a branch of the tree #' -#' Using the query language for tquery, chop of the branch down from the node that is found +#' Using the query language for tquery, chop off the branch down from the node that is found #' #' @param .tokens A tokenIndex #' @param ... Arguments passed to tquery. For instance, relation = 'punct' cuts off all punctuation dependencies (in universal dependencies) From de4bc97cf2630fd6e34c645cbd56aa874f04fb9a Mon Sep 17 00:00:00 2001 From: Mayeul Kauffmann Date: Tue, 17 Oct 2023 14:03:34 +0200 Subject: [PATCH 3/4] Update find_quotes.r typos --- R/find_quotes.r | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/R/find_quotes.r b/R/find_quotes.r index 12af8f4..c557307 100644 --- a/R/find_quotes.r +++ b/R/find_quotes.r @@ -1,13 +1,13 @@ #' Add span quotes to a source-quote annotations #' #' @description -#' Quotes can span across sentences, which makes it impossible to find them based on dependency tree quories. +#' Quotes can span across sentences, which makes it impossible to find them based on dependency tree queries. #' This function can be used as post-processing, AFTER using tqueries to find 'source' and 'quote' nodes, to add some of these quotes. #' #' The quotes themselves are often easy to detect due to the use of quotation marks. 
There are two common ways of indicating the sources. #' #' Firstly, the source might be used before the start of the quote (Steve said: "hey a quote!". "I like quotes!"). #' -#' Secondly, the source might be implied in the sentence where the quotes starts, or the sentence before that (Steve was mad. "What a stupid way of quoting me!"). +#' Secondly, the source might be implied in the sentence where the quote starts, or the sentence before that (Steve was mad. "What a stupid way of quoting me!"). #' #' In the first case, the source can be found with a tquery. If there is a source (source_val) in the quote_col that is linked to a part of the quote (quote_val), this function will add the rest of the quote. #' @@ -20,13 +20,13 @@ #' @param quote_val The value in quote_col that indicates the quote #' @param tqueries A list of tqueries, that will be performed to find source candidates. The order of the queries determines which source candidates are preferred. It would make sense to use the same value as in source_val in the 'label' argument for the tquery. #' @param par_col If available in the parser output, the column with the paragraph id. We can assume that quotes do not span across paragraphs. By using this argument, quotes that are not properly closed (uneven number of quotes) will stop at the end of the paragraph -#' @param space_col If par_col is not used, paragraphs will be identified based on hard enters in the text_col. In some parsers, there is an additional "space" column that hold the whitespace and linebreaks, which can be included here. +#' @param space_col If par_col is not used, paragraphs will be identified based on hard enters in the text_col. In some parsers, there is an additional "space" column that holds the whitespace and line breaks, which can be included here. #' @param lag_sentences The max number of sentences looked backwards to find source candidates. 
Default is 1, which means the source candidates have to occur in the sentence where the quote begins (lag = 0) or the sentence before that (lag = 1) #' @param add_quote_symbols Optionally, add additional punctuation symbols for finding quotation marks. -#' In some contexts and languages it makes sense to add single quotes, but in that case it is oftne necessary to -#' also use the quote_subset argument. For instance, in Spacy (and probably other UD based annotations), single quotes in posessives (e.g., Bob's, scholars') have a +#' In some contexts and languages it makes sense to add single quotes, but in that case it is often necessary to +#' also use the quote_subset argument. For instance, in Spacy (and probably other UD based annotations), single quotes in possessives (e.g., Bob's, scholars') have a #' PART POS tag, whereas quotation symbols have PUNCT, NOUN, VERB, or ADJ (for some reason). -#' @param quote_subset Optionally, an expression to be evaluated on the columns of 'tokens' for selecting/deselecting tokens that can/cant be quotation marks. For example, +#' @param quote_subset Optionally, an expression to be evaluated on the columns of 'tokens' for selecting/deselecting tokens that can/can't be quotation marks. For example, #' pos != "PART" can be used for the example mentioned in add_quote_symbols. #' @param copy If TRUE, deep copy the data.table (use if output tokens do not overwrite input tokens) #' @@ -65,7 +65,7 @@ #' tokens = annotate_tqueries(tokens, 'quote', dir=direct, nos=nosrc, acc=according) #' tokens #' -#' ## now we add the span quotes. If a span quote is found, the algorithm will first +#' ## Now we add the span quotes. If a span quote is found, the algorithm will first #' ## look for already annotated sources as source candidates. If there are none, #' ## additional tqueries can be used to find candidates. 
Here we simply look for #' ## the most recent PERSON entity @@ -80,7 +80,7 @@ #' tqueries=last_person) #' tokens #' -#' ## view as full text +#' ## View as full text #' syntax_reader(tokens, annotation = 'quote', value = 'source') #' } add_span_quotes <- function(tokens, text_col, quote_col='quotes', source_val='source', quote_val='quote', tqueries=NULL, par_col=NULL, space_col=NULL, lag_sentences=1, add_quote_symbols=NULL, quote_subset=NULL, copy=TRUE) { From 1798b05d140d60370fb0c9f2c8fd9d3d939b1386 Mon Sep 17 00:00:00 2001 From: Mayeul Kauffmann Date: Tue, 17 Oct 2023 14:25:28 +0200 Subject: [PATCH 4/4] Update generic_reshape.r Typos. Heading for selected_nodes() documentation. --- R/generic_reshape.r | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/R/generic_reshape.r b/R/generic_reshape.r index 5e8f3b3..b4b23fb 100644 --- a/R/generic_reshape.r +++ b/R/generic_reshape.r @@ -6,7 +6,7 @@ #' @param fill_only_first Logical, should a node only be filled once, with the nearest (first) labeled node? #' @param .one_per_sentence If true, only one match per sentence is used, giving priority to paterns closest to the root (or fartest from the root if .order = -1). #' This is sometimes necessary to deal with recursion. -#' @param .order If .one_per_sentence is used, .order determines whether the paterns closest to (1) or farthest away (-1) are used. +#' @param .order If .one_per_sentence is used, .order determines whether the patterns closest to the root (1) or farthest away from the root (-1) are used. #' #' @return A tokenIndex with a .nodes attribute, that enables the use of reshape operations on the selected nodes #' @export @@ -131,8 +131,10 @@ unselect_nodes <- function(.tokens) { .tokens } +#' Extract nodes selected with select_nodes() +#' #' If select_nodes() is used, the selected nodes can be extracted with selected_nodes(). 
-#' This is mainly for internal use, but it can also be usefull for debugging, and to controll +#' This is mainly for internal use, but it can also be useful for debugging, and to control #' loops of reshape operation (e.g. break if no selected nodes left) #' #' @param .tokens A tokenIndex in which nodes are selected with \link{select_nodes}. @@ -681,7 +683,7 @@ increment_sub_id <- function(x) { dec = ifelse(dec > 0, stringi::stri_extract_first_regex(x, '[0-9]+$'), '0') dec = as.numeric(stringi::stri_reverse(dec)) + 1 sub_id = as.numeric(stringi::stri_reverse(sprintf('%.1f', dec))) - round(int_x + sub_id, 6) ## round is necessary, because sub_id is sliiiiiigly more than what you see, which messes up the stri_count_regex for number of digits + round(int_x + sub_id, 6) ## round is necessary, because sub_id is sliiiiiightly more than what you see, which messes up the stri_count_regex for number of digits } add_sub_id <- function(.tokens, ids) {