From a8f38ed9c978fb78d301c8fcf47a8b97c5cf980f Mon Sep 17 00:00:00 2001 From: mpadge Date: Fri, 13 Sep 2024 11:00:56 +0200 Subject: [PATCH 1/5] add tree-sitter tag parser for #4 --- DESCRIPTION | 4 ++- R/tag-data-tree-sitter.R | 77 ++++++++++++++++++++++++++++++++++++++++ R/utils.R | 1 + codemeta.json | 2 +- 4 files changed, 82 insertions(+), 2 deletions(-) create mode 100644 R/tag-data-tree-sitter.R diff --git a/DESCRIPTION b/DESCRIPTION index 06b897c..e1656db 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pkgstats Title: Metrics of R Packages -Version: 0.1.6.017 +Version: 0.1.6.018 Authors@R: person("Mark", "Padgham", , "mark.padgham@email.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-2172-5265")) @@ -25,6 +25,8 @@ Imports: methods, readr, sys, + treesitter, + treesitter.r, withr Suggests: callr, diff --git a/R/tag-data-tree-sitter.R b/R/tag-data-tree-sitter.R new file mode 100644 index 0000000..b30ebec --- /dev/null +++ b/R/tag-data-tree-sitter.R @@ -0,0 +1,77 @@ +#' Get tags using tree-sitter +#' +#' Currently extracts tags from the "R" directory only, and only for R-language files. +#' +#' @inheritParams tags_data +#' @noRd +get_treesitter_tags <- function (path) { + + language <- treesitter.r::language () + parser <- treesitter::parser (language) + + tree_sitter_calls (parser, path) +} + +tree_sitter_calls <- function (parser, path, d = "R") { + flist <- fs::dir_ls (fs::path (path, d), pattern = "\\.R$") + fn_calls <- lapply (flist, function (f) { + parse_list <- control_parse (f) + fn_calls <- lapply (parse_list, function (p) { + txt <- paste0 (as.character (p), collapse = "\n") + tree <- treesitter::parser_parse (parser, txt) + walk_one_tree (tree) + }) + res <- do.call (rbind, fn_calls) + cbind (file = rep (f, nrow (res)), res) + }) + fn_calls <- do.call (rbind, fn_calls) +} + +walk_one_tree <- function (tree) { + + it <- treesitter::tree_walk (tree) + + reached_foot <- FALSE + first_identifier <- TRUE + get_next_open <- FALSE + + grammar_types <- node_text <- next_open <- fn_name <- character (0L) + + while (!reached_foot) { + field_name <- NA_to_char (it$field_name ()) + grammar_type <- NA_to_char (treesitter::node_grammar_type (it$node ())) + if (field_name == "function" && !grammar_type %in% c ("call", "extract_operator")) { + grammar_types <- c (grammar_types, grammar_type) + node_text <- c (node_text, treesitter::node_text (it$node ())) + get_next_open <- TRUE + } else if (grammar_type == "identifier" && first_identifier) { + fn_name <- treesitter::node_text (it$node ()) + first_identifier <- FALSE + } else if (get_next_open && field_name == "open") { + next_open <- c (next_open, grammar_type) + get_next_open <- FALSE + } + + if (it$goto_first_child ()) next + if (it$goto_next_sibling ()) next + + retracing <- TRUE + while (retracing) { + if (!it$goto_parent ()) { + retracing <- FALSE + reached_foot <- TRUE + } + if (it$goto_next_sibling ()) { + retracing <- FALSE + } + } + } + + # This line ensures fn_name is also length 0 when no data are parsed: + fn_name <- rep (fn_name, length (grammar_types)) + data.frame ( + fn_name = fn_name, + grammar_type = grammar_types, + node_text = node_text + ) [which (next_open != "["), ] +} diff --git a/R/utils.R b/R/utils.R index bb4a210..265fbc9 100644 --- a/R/utils.R +++ b/R/utils.R @@ -1,3 +1,4 @@ +NA_to_char <- function (i) ifelse (is.na (i), "", i) get_Rd_metadata <- utils::getFromNamespace (".Rd_get_metadata", "tools") # nolint diff --git a/codemeta.json b/codemeta.json index e5835e1..944523c 100644 --- a/codemeta.json +++ b/codemeta.json @@ -8,7 +8,7 @@ "codeRepository": "https://github.com/ropensci-review-tools/pkgstats", "issueTracker": "https://github.com/ropensci-review-tools/pkgstats/issues", "license": "https://spdx.org/licenses/GPL-3.0", - "version": "0.1.6.017", + "version": "0.1.6.018", "programmingLanguage": { "@type": "ComputerLanguage", "name": "R", From 1a4c09f0df9939897037b0734c682882ccd51fa2 Mon Sep 17 00:00:00 2001 From: mpadge Date: Fri, 13 Sep 2024 11:08:28 +0200 Subject: [PATCH 2/5] rm row names from tree-sitter tag df for #4 --- DESCRIPTION | 2 +- R/tag-data-tree-sitter.R | 2 ++ codemeta.json | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e1656db..1a5e333 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pkgstats Title: Metrics of R Packages -Version: 0.1.6.018 +Version: 0.1.6.019 Authors@R: person("Mark", "Padgham", , "mark.padgham@email.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-2172-5265")) diff --git a/R/tag-data-tree-sitter.R b/R/tag-data-tree-sitter.R index b30ebec..ae47190 100644 --- a/R/tag-data-tree-sitter.R +++ b/R/tag-data-tree-sitter.R @@ -25,6 +25,8 @@ tree_sitter_calls <- function (parser, path, d = "R") { cbind (file = rep (f, nrow (res)), res) }) fn_calls <- do.call (rbind, fn_calls) + rownames (fn_calls) <- NULL + return (fn_calls) } walk_one_tree <- function (tree) { diff --git a/codemeta.json b/codemeta.json index 944523c..687a224 100644 --- a/codemeta.json +++ b/codemeta.json @@ -8,7 +8,7 @@ "codeRepository": "https://github.com/ropensci-review-tools/pkgstats", "issueTracker": "https://github.com/ropensci-review-tools/pkgstats/issues", "license": "https://spdx.org/licenses/GPL-3.0", - "version": "0.1.6.018", + "version": "0.1.6.019", "programmingLanguage": { "@type": "ComputerLanguage", "name": "R", From b8a25a91b87715e7fe27d4ea182fcaad18ba7233 Mon Sep 17 00:00:00 2001 From: mpadge Date: Fri, 13 Sep 2024 11:30:47 +0200 Subject: [PATCH 3/5] add row start & end to tree-sitter df for #4 --- R/tag-data-tree-sitter.R | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/R/tag-data-tree-sitter.R b/R/tag-data-tree-sitter.R index ae47190..5f09456 100644 --- a/R/tag-data-tree-sitter.R +++ b/R/tag-data-tree-sitter.R @@ -33,11 +33,18 @@ walk_one_tree <- function (tree) { it <- treesitter::tree_walk (tree) + get_row_start_end <- function (it) { + point_start <- treesitter::node_start_point (it$node ()) + point_end <- treesitter::node_end_point (it$node ()) + c (treesitter::point_row (point_start), treesitter::point_row (point_end)) + } + reached_foot <- FALSE first_identifier <- TRUE get_next_open <- FALSE grammar_types <- node_text <- next_open <- fn_name <- character (0L) + row_start <- row_end <- integer (0L) while (!reached_foot) { field_name <- NA_to_char (it$field_name ()) @@ -45,6 +52,9 @@ walk_one_tree <- function (tree) { if (field_name == "function" && !grammar_type %in% c ("call", "extract_operator")) { grammar_types <- c (grammar_types, grammar_type) node_text <- c (node_text, treesitter::node_text (it$node ())) + row <- get_row_start_end (it) + row_start <- c (row_start, row [1L]) + row_end <- c (row_end, row [1L]) get_next_open <- TRUE } else if (grammar_type == "identifier" && first_identifier) { fn_name <- treesitter::node_text (it$node ()) @@ -74,6 +84,8 @@ walk_one_tree <- function (tree) { data.frame ( fn_name = fn_name, grammar_type = grammar_types, - node_text = node_text + node_text = node_text, + start = row_start, + end = row_end ) [which (next_open != "["), ] } From a67bd33dde32426f74737ac9b7f34e1b03891dba Mon Sep 17 00:00:00 2001 From: mpadge Date: Fri, 13 Sep 2024 11:58:50 +0200 Subject: [PATCH 4/5] extract external calls from tree-sitter for #4 --- DESCRIPTION | 2 +- R/external-calls.R | 30 +++++++++++++++++++++++++----- R/tag-data.R | 4 ++-- codemeta.json | 2 +- 4 files changed, 29 insertions(+), 9 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 1a5e333..433a2fe 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pkgstats Title: Metrics of R Packages -Version: 0.1.6.019 +Version: 0.1.6.020 Authors@R: person("Mark", "Padgham", , "mark.padgham@email.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-2172-5265")) diff --git a/R/external-calls.R b/R/external-calls.R index be8c5db..2e87e01 100644 --- a/R/external-calls.R +++ b/R/external-calls.R @@ -1,4 +1,3 @@ - #' Map calls from within each function to external packages #' #' @param path Path to package being analysed @@ -8,15 +7,29 @@ #' @noRd external_call_network <- function (tags_r, path, pkg_name) { - calls <- extract_call_content (tags_r) + tags_are_treesitter <- "grammar_type" %in% names (tags_r) + + if (tags_are_treesitter) { + calls <- data.frame ( + tags_line = tags_r$start, + call = tags_r$node_text, + tag = tags_r$grammar_type, + file = paste0 ("R/", basename (tags_r$file)) + ) + tags_r$kind <- "function" + index <- seq_len (nrow (tags_r)) + } else { # ctags + calls <- extract_call_content (tags_r) + index <- calls$tags_line + } if (length (calls) == 0L || nrow (calls) == 0L) { return (NULL) } - calls$kind <- tags_r$kind [calls$tags_line] - calls$start <- tags_r$start [calls$tags_line] - calls$end <- tags_r$end [calls$tags_line] + calls$kind <- tags_r$kind [index] + calls$start <- tags_r$start [index] + calls$end <- tags_r$end [index] calls$package <- NA_character_ pkg_fns <- unique (tags_r$tag [tags_r$kind == "function"]) @@ -39,6 +52,13 @@ external_call_network <- function (tags_r, path, pkg_name) { calls <- calls [which (!is.na (calls$package)), ] + # Some of these from tree-sitter flag sub-setting operations as calls; + # these are removed here: + index <- grep ("\\\"|\\(|\\)|\\]|\\]", calls$call) + if (length (index) > 0L) { + calls <- calls [-index, ] + } + rownames (calls) <- NULL return (calls) diff --git a/R/tag-data.R b/R/tag-data.R index d77d323..26d6dc0 100644 --- a/R/tag-data.R +++ b/R/tag-data.R @@ -1,4 +1,3 @@ - #' use ctags and gtags to parse call data #' #' @param path Path to local repository @@ -61,7 +60,8 @@ tags_data <- function (path, has_tabs = NULL, pkg_name = NULL) { tags_r <- withr::with_dir (path, get_ctags ("R", has_tabs)) - external_calls <- external_call_network (tags_r, path, pkg_name) + tags_treesitter <- get_treesitter_tags (path) + external_calls <- external_call_network (tags_treesitter, path, pkg_name) tags_src <- withr::with_dir (path, get_ctags ("src", has_tabs)) tags_inst <- withr::with_dir (path, get_ctags ("inst", has_tabs)) diff --git a/codemeta.json b/codemeta.json index 687a224..b14b32b 100644 --- a/codemeta.json +++ b/codemeta.json @@ -8,7 +8,7 @@ "codeRepository": "https://github.com/ropensci-review-tools/pkgstats", "issueTracker": "https://github.com/ropensci-review-tools/pkgstats/issues", "license": "https://spdx.org/licenses/GPL-3.0", - "version": "0.1.6.019", + "version": "0.1.6.020", "programmingLanguage": { "@type": "ComputerLanguage", "name": "R", From d7bf16874609f33e5a9d8ca1280267ff44be3704 Mon Sep 17 00:00:00 2001 From: mpadge Date: Fri, 13 Sep 2024 12:38:14 +0200 Subject: [PATCH 5/5] fix bug in counting external_calls for #4 --- DESCRIPTION | 2 +- R/external-calls.R | 8 ++++++-- codemeta.json | 2 +- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index 433a2fe..90d4ef4 100755 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: pkgstats Title: Metrics of R Packages -Version: 0.1.6.020 +Version: 0.1.6.021 Authors@R: person("Mark", "Padgham", , "mark.padgham@email.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-2172-5265")) diff --git a/R/external-calls.R b/R/external-calls.R index 2e87e01..fa84741 100644 --- a/R/external-calls.R +++ b/R/external-calls.R @@ -32,8 +32,12 @@ external_call_network <- function (tags_r, path, pkg_name) { calls$end <- tags_r$end [index] calls$package <- NA_character_ - pkg_fns <- unique (tags_r$tag [tags_r$kind == "function"]) - pkg_fns <- pkg_fns [which (!grepl ("^anonFunc", pkg_fns))] + if (tags_are_treesitter) { + pkg_fns <- unique (tags_r$fn_name) + } else { + pkg_fns <- unique (tags_r$tag [tags_r$kind == "function"]) + pkg_fns <- pkg_fns [which (!grepl ("^anonFunc", pkg_fns))] + } calls$package [which (calls$call %in% pkg_fns)] <- pkg_name calls <- add_base_recommended_pkgs (calls) diff --git a/codemeta.json b/codemeta.json index b14b32b..20ea3bb 100644 --- a/codemeta.json +++ b/codemeta.json @@ -8,7 +8,7 @@ "codeRepository": "https://github.com/ropensci-review-tools/pkgstats", "issueTracker": "https://github.com/ropensci-review-tools/pkgstats/issues", "license": "https://spdx.org/licenses/GPL-3.0", - "version": "0.1.6.020", + "version": "0.1.6.021", "programmingLanguage": { "@type": "ComputerLanguage", "name": "R",