Merge pull request #87 from stemangiola/fix-unnest-with-new-purrr

stemangiola · web-flow · commit 5c553dc7b0dd · 2024-04-22T16:23:57.000+10:00
Fix unnest with new purrr
diff --git a/R/dplyr_methods.R b/R/dplyr_methods.R
@@ -62,29 +62,82 @@ bind_rows.SummarizedExperiment <- function(..., .id=NULL, add.cell.ids=NULL) {
 bind_cols_internal <- function(..., .id=NULL, column_belonging=NULL) {
     tts <- tts <- flatten_if(dots_values(...), is_spliced)
 
-    tts[[1]] |> 
-        as_tibble(skip_GRanges = TRUE) |>
-        dplyr::bind_cols(tts[[2]], .id=.id) %>%
-        when(
-
-            # If the column added are not sample-wise
-            # or feature-wise, return tibble
-            (colnames(tts[[2]]) %in% c(
-                get_subset_columns(., !!s_(tts[[1]])$symbol),
-                get_subset_columns(., !!f_(tts[[1]])$symbol)
-            )
-            ) |> all() ~ update_SE_from_tibble(., tts[[1]],
-                column_belonging=column_belonging),
 
-            # Return tiblle
-            ~ {
+    # If column_belonging is provided, bind the new columns directly,
+    # without transformation to tibble
+    if(!is.null(column_belonging)){
+      
+      # For colData
+      colData_additions = column_belonging[column_belonging==s_(tts[[1]] )$name] |> names()
+      
+      data_frame_to_attach = 
+        tts[[1]] |> 
+        select(!!s_(tts[[1]] )$symbol) |> 
+        suppressMessages() |> 
+        bind_cols(tts[[2]] |> select(all_of(colData_additions))) |> 
+        distinct() 
+      
+      # Set row names
+      data_frame_to_attach = 
+        data_frame_to_attach |> 
+        select(-1) |> 
+        DataFrame(row.names = data_frame_to_attach |> pull(1)) 
+      
+      # Reorder
+      data_frame_to_attach = data_frame_to_attach[match(rownames(data_frame_to_attach), colnames(tts[[1]])), , drop=FALSE]
+      
+      # Attach
+      colData(tts[[1]]) = cbind(colData(tts[[1]]), data_frame_to_attach)
+      
+      # For rowData
+      rowData_additions = column_belonging[column_belonging==f_(tts[[1]] )$name] |> names()
+      
+      data_frame_to_attach = 
+        tts[[1]] |> 
+        select(!!f_(tts[[1]] )$symbol) |> 
+        suppressMessages() |> 
+        bind_cols(tts[[2]] |> select(all_of(rowData_additions))) |> 
+        distinct() 
+      
+      # Set row names
+      data_frame_to_attach = 
+        data_frame_to_attach |> 
+        select(-1) |> 
+        DataFrame(row.names = data_frame_to_attach |> pull(1)) 
+      
+      # Reorder
+      data_frame_to_attach = data_frame_to_attach[match(rownames(data_frame_to_attach), rownames(tts[[1]])), , drop=FALSE]
+      
+      # Attach
+      rowData(tts[[1]]) = cbind(rowData(tts[[1]]), data_frame_to_attach)
+      
+      tts[[1]]
+    }
+    
+    # If column_belonging is NOT provided, go through a tibble
+    else
+      tts[[1]] |> 
+          as_tibble(skip_GRanges = TRUE) |>
+          dplyr::bind_cols(tts[[2]], .id=.id) %>%
+          when(
+  
+              # If all the added columns are sample-wise or feature-wise, update the SE
+              (colnames(tts[[2]]) %in% c(
+                  get_subset_columns(., !!s_(tts[[1]])$symbol),
+                  get_subset_columns(., !!f_(tts[[1]])$symbol)
+              )
+              ) |> all() ~ update_SE_from_tibble(., tts[[1]], column_belonging = column_belonging),
+  
+              # Otherwise, return a tibble
+              ~ {
                 warning("tidySummarizedExperiment says:",
-                    " The new columns do not include pure sample-wise",
-                    " or feature-wise. A data frame is returned for",
-                    " independent data analysis.")
+                        " The new columns do not include pure sample-wise",
+                        " or feature-wise. A data frame is returned for",
+                        " independent data analysis.")
                 (.)
-            }
-        )
+              }
+          )
+
 }
 
 bind_cols_ <- function(..., .id=NULL) { bind_cols_internal(..., .id=NULL) }
@@ -630,19 +683,28 @@ select.SummarizedExperiment <- function(.data, ...) {
         data.frame(row.names=pull(., !!f_(.data)$symbol)) |>
         select(-!!f_(.data)$symbol) |>
         DataFrame()
-  
+    
+    # If the SE does not have rownames,
+    # I have to take them out of here, otherwise count integration,
+    # which is a matrix and behaves differently from DataFrame, fails
+    if(rownames(.data) |> is.null()) rownames(row_data_DF)  = NULL
+    
     col_data_tibble <- 
         colData(.data) |> 
         as_tibble(rownames = s_(.data)$name) 
   
     col_data_DF <-
         col_data_tibble |>  
-        select(one_of(columns_query), !!s_(.data)$symbol) |>
-        suppressWarnings() %>% 
-        data.frame(row.names=pull(., !!s_(.data)$symbol)) |>
+        select(any_of(columns_query), !!s_(.data)$symbol) |>
+        data.frame(row.names=pull(col_data_tibble, !!s_(.data)$symbol)) |>
         select(-!!s_(.data)$symbol) |>
         DataFrame()
   
+    # If the SE does not have colnames,
+    # I have to take them out of here, otherwise count integration,
+    # which is a matrix and behaves differently from DataFrame, fails
+    if(colnames(.data) |> is.null()) rownames(col_data_DF)  = NULL
+    
     count_data <-
         assays(.data)@listData %>%
             .[names(assays(.data)@listData) %in% columns_query]
diff --git a/R/tidyr_methods.R b/R/tidyr_methods.R
@@ -41,7 +41,6 @@ unnest.tidySummarizedExperiment_nested <- function(data, cols, ...,
 unnest_summarized_experiment <- function(data, cols, ...,
     keep_empty=FALSE, ptype=NULL, names_sep=NULL,
     names_repair="check_unique", .drop, .id, .sep, .preserve) {
-
     . <- NULL
 
     # Need this otherwise crashes map
@@ -71,6 +70,7 @@ unnest_summarized_experiment <- function(data, cols, ...,
         )
     }
 
+
     # If both nested by transcript and sample
     if (s_(se)$name %in% colnames(data) &
         f_(se)$name %in% colnames(data) ) {
@@ -88,11 +88,11 @@ unnest_summarized_experiment <- function(data, cols, ...,
 
     
         # Mark if columns belong to feature or sample
-        my_unnested_tibble <-
-            mutate(data, !!cols := map(!!cols, ~ as_tibble(.x))) %>%
-            select(-suppressWarnings(one_of(s_(my_se)$name,
-                f_(my_se)$name))) %>%
-            unnest(!!cols)
+        my_unnested_tibble =
+          data |> 
+          mutate(!!cols := map(!!cols, ~ as_tibble(.x))) |>
+          select(-any_of(c(s_(my_se)$name, f_(my_se)$name))) |> 
+          unnest(!!cols)
     
         # Get which column is relative to feature or sample
         sample_columns <- my_unnested_tibble %>%
@@ -130,14 +130,14 @@ unnest_summarized_experiment <- function(data, cols, ...,
             
             # Attach back the columns used for nesting
             .data_ %>%
-              select(-!!cols, - any_of(c(s_(my_se)$name, f_(my_se)$name))) %>%
+              select(-!!cols, -any_of(c(s_(my_se)$name, f_(my_se)$name))) %>%
               slice(rep(as.integer(.y), ncol(.x) * nrow(.x))),
             
             # Column sample-wise or feature-wise
             column_belonging =
               source_column[
                 .data_ %>%
-                  select(-!!cols, - any_of(c(s_(my_se)$name, f_(my_se)$name))) %>%
+                  select(-!!cols, -any_of(c(s_(my_se)$name, f_(my_se)$name))) %>%
                   colnames()
               ]
           )
@@ -219,6 +219,14 @@ nest.SummarizedExperiment <- function(.data, ..., .names_sep=NULL) {
         .data <- ping_old_special_column_into_metadata(.data)
     }
     
+    # If the data does not have row names or column names
+    # we have to add them, otherwise nesting and unnesting will get confused about
+    # the link between sample-wise columns and sample IDs
+    if(rownames(.data) |> is.null() | colnames(.data) |> is.null() )
+      warning("tidySummarizedExperiment says: the nesting and unnesting operations require row names and column names to avoid side-effects. Therefore, doors will be added as \"1\", \"2\", \"3\".")
+    if(rownames(.data) |> is.null() ) rownames(.data) = .data |> nrow() |> seq_len() |> as.character()
+    if(colnames(.data) |> is.null() ) colnames(.data) = .data |> ncol() |> seq_len() |> as.character()
+    
     my_data__ <- .data 
     
     # Names
@@ -256,12 +264,6 @@ nest.SummarizedExperiment <- function(.data, ..., .names_sep=NULL) {
             " We are working for optimising a generalised solution of nest().")
     }
 
-    # my_data__nested <-
-    #     my_data__ %>% 
-    #     # This is needed otherwise nest goes into loop and fails
-    #     as_tibble() %>%
-    #     tidyr::nest(...)
-     
     # If I nest only for .feature -> THIS WORKS ONLY WITH THE CHECK ABOVE
     if (feature_name %in% colnames(my_test_nest)) {
         return(
@@ -280,7 +282,7 @@ nest.SummarizedExperiment <- function(.data, ..., .names_sep=NULL) {
     
 
     my_data__ %>%
-        select(!!sample_symbol, !!feature_symbol, my_nesting_column) |> 
+        select(!!sample_symbol, !!feature_symbol, all_of(my_nesting_column)) |> 
         as_tibble() %>%
         tidyr::nest(...) |> 
 
@@ -311,15 +313,18 @@ nest.SummarizedExperiment <- function(.data, ..., .names_sep=NULL) {
                     # Here I am filtering because if I have 0 features this leads to failure
                     else my_transcripts= ..1 |> filter(!is.na(!!feature_symbol)) |>  pull(!!feature_symbol)
                     
-                    ###
-
+                    # If the SummarizedExperiment does not have feature IDs or sample IDs,
+                    # convert those back to integers
+                    # Note to self: this is convoluted, because if feature ID and sample ID
+                    # were to stay integers all along, this would not be needed
+                    if(rownames(my_data__) |> is.null()) my_transcripts = as.integer(my_transcripts)
+                    if(colnames(my_data__) |> is.null()) my_samples = as.integer(my_samples)
+                    
                     my_data__[unique(my_transcripts),unique(my_samples)] |>
-                      select(-one_of(
+                      select(-any_of(
                         my_nesting_column |> 
                           setdiff(c(sample_name, feature_name))
-                      )) |> 
-                      suppressWarnings()
-
+                      )) 
                  
                 }
             )
diff --git a/R/utilities.R b/R/utilities.R
@@ -749,7 +749,8 @@ get_count_datasets <- function(se) {
   se <- check_se_dimnames(se)
   
   # Join assays
-  map2( 
+  list_assays = 
+    map2( 
     assays(se, withDimnames = FALSE) %>% as.list(),
     names(assays(se)),
     ~ {
@@ -807,21 +808,27 @@ get_count_datasets <- function(se) {
       f_(se)$name %in% colnames(.x) %>% not ~ mutate(.x, !!f_(se)$symbol := as.character(NA)),
       s_(se)$name %in% colnames(.x) %>% not ~ mutate(.x, !!s_(se)$symbol := as.character(NA)),
       ~ .x
-    )) |> 
-    
-    when(
-      length(.)>0 ~ 
-        
-        reduce(., full_join, by = c(f_(se)$name, s_(se)$name)),
-      # reduce(., left_join, by = c(f_(se)$name, s_(se)$name)),
-      # bind_cols(.,  .name_repair = c("minimal")) %>% .[!duplicated(colnames(.))], 
-      ~ expand.grid(
-        rownames(se), colnames(se)
-      ) %>% 
-        setNames(c(f_(se)$name, s_(se)$name)) %>%
-        tibble::as_tibble()
-    ) 
+    )) 
+  
+  # If assays is non empty 
+  if(list_assays |> length() > 0)
+    list_assays |> 
+    reduce(full_join, by = c(f_(se)$name, s_(se)$name))
   
+  # If assays is empty 
+  else {
+    
+    # If I don't have row or column names, use positional indices as names
+    if(se |> rownames() |> is.null()) rn = nrow(se) |> seq_len() |> as.character()
+    else rn = rownames(se)
+    if(se |> colnames() |> is.null()) cn = ncol(se) |> seq_len() |> as.character()
+    else cn = colnames(se)
+    
+    expand.grid(  rn, cn  ) |> 
+             setNames(c(f_(se)$name, s_(se)$name)) |> 
+             tibble::as_tibble()
+  }
+   
   
 }