diff --git a/r/R/feather.R b/r/R/feather.R
index 4008041106d..1ae2c88ee3d 100644
--- a/r/R/feather.R
+++ b/r/R/feather.R
@@ -15,56 +15,25 @@
 # specific language governing permissions and limitations
 # under the License.
 
-#' Write a Feather file (an Arrow IPC file)
+#' Write a Feather file (deprecated)
 #'
-#' Feather provides binary columnar serialization for data frames.
-#' It is designed to make reading and writing data frames efficient,
-#' and to make sharing data across data analysis languages easy.
-#' [write_feather()] can write both the Feather Version 1 (V1),
-#' a legacy version available starting in 2016, and the Version 2 (V2),
-#' which is the Apache Arrow IPC file format.
-#' The default version is V2.
-#' V1 files are distinct from Arrow IPC files and lack many features,
-#' such as the ability to store all Arrow data tyeps, and compression support.
-#' [write_ipc_file()] can only write V2 files.
+#' @description
+#' `write_feather()` is deprecated and will be removed in a future release.
+#' Use [write_ipc_file()] instead.
 #'
-#' @param x `data.frame`, [RecordBatch], or [Table]
-#' @param sink A string file path, connection, URI, or [OutputStream], or path in a file
-#' system (`SubTreeFileSystem`)
+#' Column-oriented file format designed for fast reading and writing
+#' of data frames. Feather V2 is the Arrow IPC file format.
+#' Feather V1 was a legacy format available starting in 2016 and lacks many
+#' features, such as the ability to store all Arrow data types, and compression
+#' support. Writing the Feather V1 format is deprecated.
+#'
+#' @inheritParams write_ipc_file
 #' @param version integer Feather file version, Version 1 or Version 2. Version 2 is the default.
-#' @param chunk_size For V2 files, the number of rows that each chunk of data
-#' should have in the file. Use a smaller `chunk_size` when you need faster
-#' random row access. Default is 64K. This option is not supported for V1.
-#' @param compression Name of compression codec to use, if any. Default is
-#' "lz4" if LZ4 is available in your build of the Arrow C++ library, otherwise
-#' "uncompressed". "zstd" is the other available codec and generally has better
-#' compression ratios in exchange for slower read and write performance.
-#' "lz4" is shorthand for the "lz4_frame" codec.
-#' See [codec_is_available()] for details.
-#' `TRUE` and `FALSE` can also be used in place of "default" and "uncompressed".
-#' This option is not supported for V1.
-#' @param compression_level If `compression` is "zstd", you may
-#' specify an integer compression level. If omitted, the compression codec's
-#' default compression level is used.
 #'
 #' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream],
 #' the stream will be left open.
 #' @export
-#' @seealso [RecordBatchWriter] for lower-level access to writing Arrow IPC data.
-#' @seealso [Schema] for information about schemas and metadata handling.
-#' @examples
-#' # We recommend the ".arrow" extension for Arrow IPC files (Feather V2).
-#' tf1 <- tempfile(fileext = ".feather")
-#' tf2 <- tempfile(fileext = ".arrow")
-#' tf3 <- tempfile(fileext = ".arrow")
-#' on.exit({
-#' unlink(tf1)
-#' unlink(tf2)
-#' unlink(tf3)
-#' })
-#' write_feather(mtcars, tf1, version = 1)
-#' write_feather(mtcars, tf2)
-#' write_ipc_file(mtcars, tf3)
+#' @seealso [write_ipc_file()]
 #' @include arrow-object.R
 write_feather <- function(
   x,
@@ -73,6 +42,38 @@ write_feather <- function(
   chunk_size = 65536L,
   compression = c("default", "lz4", "lz4_frame", "uncompressed", "zstd"),
   compression_level = NULL
+) {
+  if (version == 2) {
+    .Deprecated(
+      "write_ipc_file",
+      msg = "write_feather(version = 2) has been superseded by write_ipc_file()."
+    )
+  } else {
+    .Deprecated(
+      "write_ipc_file",
+      msg = paste(
+        "Feather V1 is deprecated;",
+        "use `write_ipc_file()` to write Arrow IPC format (equivalent to Feather V2)."
+ ) + ) + } + write_ipc_impl( + x = x, + sink = sink, + version = version, + chunk_size = chunk_size, + compression = compression, + compression_level = compression_level + ) +} + +write_ipc_impl <- function( + x, + sink, + version = 2, + chunk_size = 65536L, + compression = c("default", "lz4", "lz4_frame", "uncompressed", "zstd"), + compression_level = NULL ) { # Handle and validate options before touching data version <- as.integer(version) @@ -103,18 +104,13 @@ write_feather <- function( compression_level <- as.integer(compression_level) # Now make sure that options make sense together if (version == 1) { - if (chunk_size != 65536L) { - stop("Feather version 1 does not support the 'chunk_size' option", call. = FALSE) - } - if (compression != "uncompressed") { - stop("Feather version 1 does not support the 'compression' option", call. = FALSE) - } - if (compression_level != -1L) { - stop("Feather version 1 does not support the 'compression_level' option", call. = FALSE) - } + check_feather_v1_options(chunk_size, compression, compression_level) } if (compression != "zstd" && compression_level != -1L) { - stop("Can only specify a 'compression_level' when 'compression' is 'zstd'", call. = FALSE) + stop( + "Can only specify a 'compression_level' when 'compression' is 'zstd'", + call. = FALSE + ) } # Finally, add 1 to version because 2 means V1 and 3 means V2 :shrug: version <- version + 1L @@ -133,12 +129,74 @@ write_feather <- function( sink <- make_output_stream(sink) on.exit(sink$close()) } - ipc___WriteFeather__Table(sink, x, version, chunk_size, compression, compression_level) + ipc___WriteFeather__Table( + sink, + x, + version, + chunk_size, + compression, + compression_level + ) invisible(x_out) } -#' @rdname write_feather +check_feather_v1_options <- function( + chunk_size, + compression, + compression_level +) { + if (chunk_size != 65536L) { + stop( + "Feather version 1 does not support the 'chunk_size' option", + call. 
= FALSE + ) + } + if (compression != "uncompressed") { + stop( + "Feather version 1 does not support the 'compression' option", + call. = FALSE + ) + } + if (compression_level != -1L) { + stop( + "Feather version 1 does not support the 'compression_level' option", + call. = FALSE + ) + } +} + + +#' Write an Arrow IPC file +#' +#' The Arrow IPC file format provides binary columnar serialization for data frames. +#' It is designed to make reading and writing data frames efficient, +#' and to make sharing data across data analysis languages easy. +#' +#' @param x `data.frame`, [RecordBatch], or [Table] +#' @param sink A string file path, connection, URI, or [OutputStream], or path in a file +#' system (`SubTreeFileSystem`) +#' @param chunk_size The number of rows that each chunk of data should have in the file. +#' Use a smaller `chunk_size` when you need faster random row access. Default is 64K. +#' @param compression Name of compression codec to use, if any. Default is +#' "lz4" if LZ4 is available in your build of the Arrow C++ library, otherwise +#' "uncompressed". "zstd" is the other available codec and generally has better +#' compression ratios in exchange for slower read and write performance. +#' "lz4" is shorthand for the "lz4_frame" codec. +#' See [codec_is_available()] for details. +#' `TRUE` and `FALSE` can also be used in place of "default" and "uncompressed". +#' @param compression_level If `compression` is "zstd", you may +#' specify an integer compression level. If omitted, the compression codec's +#' default compression level is used. +#' +#' @return The input `x`, invisibly. Note that if `sink` is an [OutputStream], +#' the stream will be left open. #' @export +#' @seealso [RecordBatchWriter] for lower-level access to writing Arrow IPC data. +#' @seealso [Schema] for information about schemas and metadata handling. 
+#' @examples +#' tf <- tempfile(fileext = ".arrow") +#' on.exit(unlink(tf)) +#' write_ipc_file(mtcars, tf) write_ipc_file <- function( x, sink, @@ -146,20 +204,55 @@ write_ipc_file <- function( compression = c("default", "lz4", "lz4_frame", "uncompressed", "zstd"), compression_level = NULL ) { - mc <- match.call() - mc$version <- 2 - mc[[1]] <- get("write_feather", envir = asNamespace("arrow")) - eval.parent(mc) + write_ipc_impl( + x = x, + sink = sink, + version = 2, + chunk_size = chunk_size, + compression = compression, + compression_level = compression_level + ) +} + +#' Read a Feather file (deprecated) +#' +#' @description +#' `read_feather()` is deprecated and will be removed in a future release. +#' Use [read_ipc_file()] instead. +#' +#' `read_feather()` can read both the Feather V1 format (a legacy format which +#' is also being deprecated) and the Feather V2 format (which is the Arrow IPC format). +#' `read_ipc_file()` can also read both formats. +#' +#' @inheritParams read_ipc_file +#' +#' @return A `tibble` if `as_data_frame` is `TRUE` (the default), or an +#' Arrow [Table] otherwise +#' +#' @export +#' @seealso [read_ipc_file()] +read_feather <- function( + file, + col_select = NULL, + as_data_frame = TRUE, + mmap = TRUE +) { + .Deprecated("read_ipc_file") + read_ipc_file( + file = file, + col_select = {{ col_select }}, + as_data_frame = as_data_frame, + mmap = mmap + ) } -#' Read a Feather file (an Arrow IPC file) +#' Read an Arrow IPC file #' -#' Feather provides binary columnar serialization for data frames. +#' The Arrow IPC file format provides binary columnar serialization for data frames. #' It is designed to make reading and writing data frames efficient, #' and to make sharing data across data analysis languages easy. -#' [read_feather()] can read both the Feather Version 1 (V1), a legacy version available starting in 2016, -#' and the Version 2 (V2), which is the Apache Arrow IPC file format. 
-#' [read_ipc_file()] is an alias of [read_feather()]. +#' +#' This function can also read the legacy Feather V1 format. #' #' @inheritParams read_ipc_stream #' @inheritParams read_delim_arrow @@ -171,15 +264,19 @@ write_ipc_file <- function( #' @export #' @seealso [FeatherReader] and [RecordBatchReader] for lower-level access to reading Arrow IPC data. #' @examples -#' # We recommend the ".arrow" extension for Arrow IPC files (Feather V2). #' tf <- tempfile(fileext = ".arrow") #' on.exit(unlink(tf)) -#' write_feather(mtcars, tf) -#' df <- read_feather(tf) +#' write_ipc_file(mtcars, tf) +#' df <- read_ipc_file(tf) #' dim(df) #' # Can select columns -#' df <- read_feather(tf, col_select = starts_with("d")) -read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, mmap = TRUE) { +#' df <- read_ipc_file(tf, col_select = starts_with("d")) +read_ipc_file <- function( + file, + col_select = NULL, + as_data_frame = TRUE, + mmap = TRUE +) { if (!inherits(file, "RandomAccessFile")) { # Compression is handled inside the IPC file format, so we don't need # to detect from the file extension and wrap in a CompressedInputStream @@ -210,10 +307,6 @@ read_feather <- function(file, col_select = NULL, as_data_frame = TRUE, mmap = T out } -#' @rdname read_feather -#' @export -read_ipc_file <- read_feather - #' @title FeatherReader class #' @rdname FeatherReader #' @name FeatherReader diff --git a/r/tests/testthat/helper-filesystems.R b/r/tests/testthat/helper-filesystems.R index 7b37abf764b..aa889c9a8ad 100644 --- a/r/tests/testthat/helper-filesystems.R +++ b/r/tests/testthat/helper-filesystems.R @@ -27,15 +27,15 @@ test_filesystem <- function(name, fs, path_formatter, uri_formatter) { # NOTE: it's important that we label these tests with name of filesystem so # that we can differentiate the different calls to these test in the output. 
- test_that(sprintf("read/write Feather on %s using URIs", name), { - write_feather(example_data, uri_formatter("test.feather")) - expect_identical(read_feather(uri_formatter("test.feather")), example_data) + test_that(sprintf("read/write IPC on %s using URIs", name), { + write_ipc_file(example_data, uri_formatter("test.arrow")) + expect_identical(read_ipc_file(uri_formatter("test.arrow")), example_data) }) - test_that(sprintf("read/write Feather on %s using Filesystem", name), { - write_feather(example_data, fs$path(path_formatter("test2.feather"))) + test_that(sprintf("read/write IPC on %s using Filesystem", name), { + write_ipc_file(example_data, fs$path(path_formatter("test2.arrow"))) expect_identical( - read_feather(fs$path(path_formatter("test2.feather"))), + read_ipc_file(fs$path(path_formatter("test2.arrow"))), example_data ) }) diff --git a/r/tests/testthat/test-Array.R b/r/tests/testthat/test-Array.R index 96e80a499ae..582e2875ec8 100644 --- a/r/tests/testthat/test-Array.R +++ b/r/tests/testthat/test-Array.R @@ -334,10 +334,10 @@ test_that("Timezone handling in Arrow roundtrip (ARROW-3543)", { # Confirming that the columns are in fact different expect_false(any(df$no_tz == df$yes_tz)) } - feather_file <- tempfile() - on.exit(unlink(feather_file)) - write_feather(df, feather_file) - expect_identical(read_feather(feather_file), df) + ipc_file <- tempfile() + on.exit(unlink(ipc_file)) + write_ipc_file(df, ipc_file) + expect_identical(read_ipc_file(ipc_file), df) }) test_that("array supports integer64", { diff --git a/r/tests/testthat/test-buffer.R b/r/tests/testthat/test-buffer.R index 4b67cbceb68..7bfcbd42c42 100644 --- a/r/tests/testthat/test-buffer.R +++ b/r/tests/testthat/test-buffer.R @@ -69,7 +69,7 @@ test_that("can read remaining bytes of a RandomAccessFile", { tab <- Table$create(!!!tbl) tf <- tempfile() - all_bytes <- write_feather(tab, tf) + all_bytes <- write_ipc_file(tab, tf) file <- ReadableFile$create(tf) expect_equal(file$tell(), 0) diff --git 
a/r/tests/testthat/test-dataset-write.R b/r/tests/testthat/test-dataset-write.R index d675e4950d2..2631be159f4 100644 --- a/r/tests/testthat/test-dataset-write.R +++ b/r/tests/testthat/test-dataset-write.R @@ -62,7 +62,7 @@ test_that("Writing a dataset: CSV->IPC", { ) # Check whether "int" is present in the files or just in the dirs - first <- read_feather( + first <- read_ipc_file( dir(dst_dir, pattern = ".arrow$", recursive = TRUE, full.names = TRUE)[1], as_data_frame = FALSE ) diff --git a/r/tests/testthat/test-dataset.R b/r/tests/testthat/test-dataset.R index 31457eee8a3..1ab70aa2f24 100644 --- a/r/tests/testthat/test-dataset.R +++ b/r/tests/testthat/test-dataset.R @@ -41,8 +41,8 @@ test_that("Setup (putting data in the dir)", { # Now, an IPC format dataset dir.create(file.path(ipc_dir, 3)) dir.create(file.path(ipc_dir, 4)) - write_feather(df1, file.path(ipc_dir, 3, "file1.arrow")) - write_feather(df2, file.path(ipc_dir, 4, "file2.arrow")) + write_ipc_file(df1, file.path(ipc_dir, 3, "file1.arrow")) + write_ipc_file(df2, file.path(ipc_dir, 4, "file2.arrow")) expect_length(dir(ipc_dir, recursive = TRUE), 2) }) @@ -98,7 +98,7 @@ test_that("URI-decoding with directory partitioning", { selector <- FileSelector$create(root, recursive = TRUE) dir1 <- file.path(root, "2021-05-04 00%3A00%3A00", "%24") dir.create(dir1, recursive = TRUE) - write_feather(df1, file.path(dir1, "data.feather")) + write_ipc_file(df1, file.path(dir1, "data.arrow")) partitioning <- DirectoryPartitioning$create( schema(date = timestamp(unit = "s"), string = utf8()) @@ -178,7 +178,7 @@ test_that("URI-decoding with hive partitioning", { selector <- FileSelector$create(root, recursive = TRUE) dir1 <- file.path(root, "date=2021-05-04 00%3A00%3A00", "string=%24") dir.create(dir1, recursive = TRUE) - write_feather(df1, file.path(dir1, "data.feather")) + write_ipc_file(df1, file.path(dir1, "data.arrow")) partitioning <- hive_partition( date = timestamp(unit = "s"), @@ -254,7 +254,7 @@ 
test_that("URI-decoding with hive partitioning with key encoded", { selector <- FileSelector$create(root, recursive = TRUE) dir1 <- file.path(root, "test%20key=2021-05-04 00%3A00%3A00", "test%20key1=%24") dir.create(dir1, recursive = TRUE) - write_feather(df1, file.path(dir1, "data.feather")) + write_ipc_file(df1, file.path(dir1, "data.arrow")) partitioning <- hive_partition( `test key` = timestamp(unit = "s"), diff --git a/r/tests/testthat/test-extension.R b/r/tests/testthat/test-extension.R index c4fe36c0f41..d82c96fa5e7 100644 --- a/r/tests/testthat/test-extension.R +++ b/r/tests/testthat/test-extension.R @@ -191,8 +191,8 @@ test_that("vctrs extension type works", { tf <- tempfile() on.exit(unlink(tf)) - write_feather(arrow_table(col = array_in), tf) - table_out <- read_feather(tf, as_data_frame = FALSE) + write_ipc_file(arrow_table(col = array_in), tf) + table_out <- read_ipc_file(tf, as_data_frame = FALSE) array_out <- table_out$col$chunk(0) expect_r6_class(array_out$type, "VctrsExtensionType") diff --git a/r/tests/testthat/test-feather.R b/r/tests/testthat/test-feather.R index 188a562fe81..8a68d4d0eff 100644 --- a/r/tests/testthat/test-feather.R +++ b/r/tests/testthat/test-feather.R @@ -324,8 +324,16 @@ test_that("Error is created when feather reads a parquet file", { ) }) -test_that("The read_ipc_file function is an alias of read_feather", { - expect_identical(read_ipc_file, read_feather) +test_that("read_feather calls read_ipc_file", { + tf <- tempfile() + on.exit(unlink(tf)) + write_ipc_file(example_data, tf) + expect_warning( + result_feather <- read_feather(tf), + "deprecated" + ) + result_ipc <- read_ipc_file(tf) + expect_identical(result_feather, result_ipc) }) test_that("Can read Feather files from a URL", { diff --git a/r/tests/testthat/test-metadata.R b/r/tests/testthat/test-metadata.R index 90b9f599ec7..236a48735b7 100644 --- a/r/tests/testthat/test-metadata.R +++ b/r/tests/testthat/test-metadata.R @@ -260,20 +260,20 @@ test_that("R metadata 
roundtrip via parquet", { expect_identical(read_parquet(tf), example_with_metadata) }) -test_that("R metadata roundtrip via feather", { +test_that("R metadata roundtrip via IPC", { tf <- tempfile() on.exit(unlink(tf)) - write_feather(example_with_metadata, tf) - expect_identical(read_feather(tf), example_with_metadata) + write_ipc_file(example_with_metadata, tf) + expect_identical(read_ipc_file(tf), example_with_metadata) }) -test_that("haven types roundtrip via feather", { +test_that("haven types roundtrip via IPC", { tf <- tempfile() on.exit(unlink(tf)) - write_feather(haven_data, tf) - expect_identical(read_feather(tf), haven_data) + write_ipc_file(haven_data, tf) + expect_identical(read_ipc_file(tf), haven_data) }) test_that("Date/time type roundtrip", { diff --git a/r/tests/testthat/test-read-record-batch.R b/r/tests/testthat/test-read-record-batch.R index 7f310e8fc91..87169faac02 100644 --- a/r/tests/testthat/test-read-record-batch.R +++ b/r/tests/testthat/test-read-record-batch.R @@ -37,7 +37,7 @@ test_that("RecordBatchFileWriter / RecordBatchFileReader roundtrips", { writer$close() stream$close() - expect_equal(read_feather(tf, as_data_frame = FALSE, mmap = FALSE), tab) + expect_equal(read_ipc_file(tf, as_data_frame = FALSE, mmap = FALSE), tab) # Make sure connections are closed expect_error(file.remove(tf), NA) skip_on_os("windows") # This should pass, we've closed the stream diff --git a/r/tests/testthat/test-read-write.R b/r/tests/testthat/test-read-write.R index ac156c643db..1714da1bcf2 100644 --- a/r/tests/testthat/test-read-write.R +++ b/r/tests/testthat/test-read-write.R @@ -65,9 +65,9 @@ test_that("table round trip", { expect_equal(chunked_array_raw$chunk(i - 1L), chunks_raw[[i]]) } tf <- tempfile() - write_feather(tbl, tf) + write_ipc_file(tbl, tf) - res <- read_feather(tf) + res <- read_ipc_file(tf) expect_identical(tbl$int, res$int) expect_identical(tbl$dbl, res$dbl) expect_identical(as.integer(tbl$raw), res$raw) @@ -98,9 +98,9 @@ 
test_that("table round trip handles NA in integer and numeric", { expect_equal(tab$column(2)$type, uint8()) tf <- tempfile() - write_feather(tbl, tf) + write_ipc_file(tbl, tf) - res <- read_feather(tf) + res <- read_ipc_file(tf) expect_identical(tbl$int, res$int) expect_identical(tbl$dbl, res$dbl) expect_identical(as.integer(tbl$raw), res$raw) diff --git a/r/tests/testthat/test-s3.R b/r/tests/testthat/test-s3.R index c8c0f8607c4..ea7a91cd951 100644 --- a/r/tests/testthat/test-s3.R +++ b/r/tests/testthat/test-s3.R @@ -45,9 +45,9 @@ if (run_these) { now <- as.numeric(Sys.time()) on.exit(bucket$DeleteDir(now)) - test_that("read/write Feather on S3", { - write_feather(example_data, bucket_uri(now, "test.feather")) - expect_identical(read_feather(bucket_uri(now, "test.feather")), example_data) + test_that("read/write IPC on S3", { + write_ipc_file(example_data, bucket_uri(now, "test.arrow")) + expect_identical(read_ipc_file(bucket_uri(now, "test.arrow")), example_data) }) test_that("read/write Parquet on S3", { diff --git a/r/tests/testthat/test-utf.R b/r/tests/testthat/test-utf.R index 26ee03485dd..41de2e6ffd1 100644 --- a/r/tests/testthat/test-utf.R +++ b/r/tests/testthat/test-utf.R @@ -64,9 +64,9 @@ test_that("We handle non-UTF strings", { expect_equal_data_frame(record_batch(df_struct, schema = df_struct_schema), df_struct) # Serialization - feather_file <- tempfile() - write_feather(df_struct, feather_file) - expect_identical(read_feather(feather_file), df_struct) + ipc_file <- tempfile() + write_ipc_file(df_struct, ipc_file) + expect_identical(read_ipc_file(ipc_file), df_struct) if (arrow_with_parquet()) { parquet_file <- tempfile()