
Commit 2ed0960

Merge remote-tracking branch 'refs/remotes/github/main' into shobs-series-dot-df
2 parents: e41e83d + fbc31ab

File tree: 17 files changed (+544, −118 lines)

.pre-commit-config.yaml

Lines changed: 1 addition & 1 deletion
@@ -38,4 +38,4 @@ repos:
     rev: v1.1.1
     hooks:
       - id: mypy
-        additional_dependencies: [types-requests]
+        additional_dependencies: [types-requests, types-tabulate]

README.rst

Lines changed: 3 additions & 4 deletions
@@ -267,10 +267,9 @@ definition. To view and manage connections, do the following:
 3. In the Explorer pane, expand that project and then expand External connections.

 BigQuery remote functions are created in the dataset you specify, or
-in a dataset with the name ``bigframes_temp_location``, where location is
-the location used by the BigQuery DataFrames session. For example,
-``bigframes_temp_us_central1``. To view and manage remote functions, do
-the following:
+in a special type of `hidden dataset <https://cloud.google.com/bigquery/docs/datasets#hidden_datasets>`__
+referred to as an anonymous dataset. To view and manage remote functions created
+in a user provided dataset, do the following:

 1. Go to `BigQuery in the Google Cloud Console <https://console.cloud.google.com/bigquery>`__.
 2. Select the project in which you created the remote function.
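
The routines in a user-provided dataset can also be inspected programmatically. A minimal sketch using the google-cloud-bigquery client; the project and dataset IDs are placeholders, not values from this commit:

from google.cloud import bigquery

# List the remote functions (routines) in a user-provided dataset.
client = bigquery.Client()
for routine in client.list_routines("my-project.my_dataset"):  # placeholder IDs
    print(routine.reference)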

bigframes/_config/display_options.py

Lines changed: 4 additions & 0 deletions
@@ -32,6 +32,10 @@ class DisplayOptions:
     progress_bar: Optional[str] = "auto"
     repr_mode: Literal["head", "deferred"] = "head"

+    max_info_columns: int = 100
+    max_info_rows: Optional[int] = 200000
+    memory_usage: bool = True
+

 @contextlib.contextmanager
 def pandas_repr(display_options: DisplayOptions):
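
These three options mirror their pandas counterparts and drive the DataFrame.info implementation added later in this commit. A minimal sketch of adjusting them at runtime, assuming the usual bigframes.options entry point:

import bigframes

# Skip the potentially expensive non-null counts in DataFrame.info()
# for tables beyond 10,000 rows, and suppress the memory-usage line.
bigframes.options.display.max_info_rows = 10_000
bigframes.options.display.memory_usage = False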

bigframes/core/blocks.py

Lines changed: 23 additions & 18 deletions
@@ -389,23 +389,6 @@ def to_pandas(
         ordered: bool = True,
     ) -> Tuple[pd.DataFrame, bigquery.QueryJob]:
         """Run query and download results as a pandas DataFrame."""
-        if max_download_size is None:
-            max_download_size = bigframes.options.sampling.max_download_size
-        if sampling_method is None:
-            sampling_method = (
-                bigframes.options.sampling.sampling_method
-                if bigframes.options.sampling.sampling_method is not None
-                else _UNIFORM
-            )
-        if random_state is None:
-            random_state = bigframes.options.sampling.random_state
-
-        sampling_method = sampling_method.lower()
-        if sampling_method not in _SAMPLING_METHODS:
-            raise NotImplementedError(
-                f"The downsampling method {sampling_method} is not implemented, "
-                f"please choose from {','.join(_SAMPLING_METHODS)}."
-            )

         df, _, query_job = self._compute_and_count(
             value_keys=value_keys,

@@ -453,6 +436,28 @@ def _compute_and_count(
     ) -> Tuple[pd.DataFrame, int, bigquery.QueryJob]:
         """Run query and download results as a pandas DataFrame. Return the total number of results as well."""
         # TODO(swast): Allow for dry run and timeout.
+        enable_downsampling = (
+            True
+            if sampling_method is not None
+            else bigframes.options.sampling.enable_downsampling
+        )
+
+        max_download_size = (
+            max_download_size or bigframes.options.sampling.max_download_size
+        )
+
+        random_state = random_state or bigframes.options.sampling.random_state
+
+        if sampling_method is None:
+            sampling_method = bigframes.options.sampling.sampling_method or _UNIFORM
+        sampling_method = sampling_method.lower()
+
+        if sampling_method not in _SAMPLING_METHODS:
+            raise NotImplementedError(
+                f"The downsampling method {sampling_method} is not implemented, "
+                f"please choose from {','.join(_SAMPLING_METHODS)}."
+            )
+
         expr = self._apply_value_keys_to_expr(value_keys=value_keys)

         results_iterator, query_job = expr.start_query(

@@ -469,7 +474,7 @@
         )

         if fraction < 1:
-            if not bigframes.options.sampling.enable_downsampling:
+            if not enable_downsampling:
                 raise RuntimeError(
                     f"The data size ({table_size:.2f} MB) exceeds the maximum download limit of "
                     f"{max_download_size} MB. You can:\n\t* Enable downsampling in global options:\n"
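
The net effect: option resolution moves into _compute_and_count, and passing an explicit sampling_method now opts that call in to downsampling even when the global flag is off. A minimal sketch of the global knobs named in the diff, assuming they are writable like other bigframes options:

import bigframes

# Permit automatic downsampling of oversized results and cap
# downloads at 100 MB; both option names appear in the diff above.
bigframes.options.sampling.enable_downsampling = True
bigframes.options.sampling.max_download_size = 100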

bigframes/core/indexes/index.py

Lines changed: 12 additions & 1 deletion
@@ -155,6 +155,14 @@ def _block(self) -> blocks.Block:
     def T(self) -> Index:
         return self.transpose()

+    def _memory_usage(self) -> int:
+        (n_rows,) = self.shape
+        return sum(
+            self.dtypes.map(
+                lambda dtype: bigframes.dtypes.DTYPE_BYTE_SIZES.get(dtype, 8) * n_rows
+            )
+        )
+
     def transpose(self) -> Index:
         return self

@@ -326,7 +334,10 @@ def _apply_aggregation(self, op: agg_ops.AggregateOp) -> typing.Any:

     def __getitem__(self, key: int) -> typing.Any:
         if isinstance(key, int):
-            result_pd_df, _ = self._block.slice(key, key + 1, 1).to_pandas()
+            if key != -1:
+                result_pd_df, _ = self._block.slice(key, key + 1, 1).to_pandas()
+            else:  # special case, want [-1:] instead of [-1:0]
+                result_pd_df, _ = self._block.slice(key).to_pandas()
             if result_pd_df.empty:
                 raise IndexError("single positional indexer is out-of-bounds")
             return result_pd_df.index[0]
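
The -1 special case exists because a stop of key + 1 misbehaves only there: under plain Python slice semantics, the pair (-1, 0) denotes an empty range. A standalone illustration:

values = [10, 20, 30]

# key = -1 would produce slice(-1, 0), which is empty...
print(values[-1:0])  # []
# ...so the last element must be fetched with an open-ended slice.
print(values[-1:])   # [30]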

bigframes/dataframe.py

Lines changed: 84 additions & 0 deletions
@@ -18,6 +18,7 @@

 import datetime
 import re
+import sys
 import textwrap
 import typing
 from typing import (

@@ -36,6 +37,7 @@
 import google.cloud.bigquery as bigquery
 import numpy
 import pandas
+import tabulate

 import bigframes
 import bigframes._config.display_options as display_options

@@ -350,6 +352,88 @@ def query_job(self) -> Optional[bigquery.QueryJob]:
         self._set_internal_query_job(self._compute_dry_run())
         return self._query_job

+    def memory_usage(self, index: bool = True):
+        n_rows, _ = self.shape
+        # like pandas, treat all variable-size objects as just 8-byte pointers, ignoring actual object
+        column_sizes = self.dtypes.map(
+            lambda dtype: bigframes.dtypes.DTYPE_BYTE_SIZES.get(dtype, 8) * n_rows
+        )
+        if index:
+            index_size = pandas.Series([self.index._memory_usage()], index=["Index"])
+            column_sizes = pandas.concat([index_size, column_sizes])
+        return column_sizes
+
+    def info(
+        self,
+        verbose: Optional[bool] = None,
+        buf=None,
+        max_cols: Optional[int] = None,
+        memory_usage: Optional[bool] = None,
+        show_counts: Optional[bool] = None,
+    ):
+        obuf = buf or sys.stdout
+
+        n_rows, n_columns = self.shape
+
+        max_cols = (
+            max_cols
+            if max_cols is not None
+            else bigframes.options.display.max_info_columns
+        )
+
+        show_all_columns = verbose if verbose is not None else (n_columns < max_cols)
+
+        obuf.write(f"{type(self)}\n")
+
+        index_type = "MultiIndex" if self.index.nlevels > 1 else "Index"
+
+        # These accesses are kind of expensive, maybe should try to skip?
+        first_indice = self.index[0]
+        last_indice = self.index[-1]
+        obuf.write(f"{index_type}: {n_rows} entries, {first_indice} to {last_indice}\n")
+
+        dtype_strings = self.dtypes.astype("string")
+        if show_all_columns:
+            obuf.write(f"Data columns (total {n_columns} columns):\n")
+            column_info = self.columns.to_frame(name="Column")
+
+            max_rows = bigframes.options.display.max_info_rows
+            too_many_rows = n_rows > max_rows if max_rows is not None else False
+
+            if show_counts if show_counts is not None else (not too_many_rows):
+                non_null_counts = self.count().to_pandas()
+                column_info["Non-Null Count"] = non_null_counts.map(
+                    lambda x: f"{int(x)} non-null"
+                )
+
+            column_info["Dtype"] = dtype_strings
+
+            column_info = column_info.reset_index(drop=True)
+            column_info.index.name = "#"
+
+            column_info_formatted = tabulate.tabulate(column_info, headers="keys")  # type: ignore
+            obuf.write(column_info_formatted)
+            obuf.write("\n")
+
+        else:  # Just number of columns and first, last
+            obuf.write(
+                f"Columns: {n_columns} entries, {self.columns[0]} to {self.columns[-1]}\n"
+            )
+        dtype_counts = dtype_strings.value_counts().sort_index(ascending=True).items()
+        dtype_counts_formatted = ", ".join(
+            f"{dtype}({count})" for dtype, count in dtype_counts
+        )
+        obuf.write(f"dtypes: {dtype_counts_formatted}\n")
+
+        show_memory = (
+            memory_usage
+            if memory_usage is not None
+            else bigframes.options.display.memory_usage
+        )
+        if show_memory:
+            # TODO: Convert to different units (kb, mb, etc.)
+            obuf.write(f"memory usage: {self.memory_usage().sum()} bytes\n")
+
     def _set_internal_query_job(self, query_job: bigquery.QueryJob):
         self._query_job = query_job
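
A minimal usage sketch of the two new methods; the public table is only an example, not taken from this commit:

import bigframes.pandas as bpd

df = bpd.read_gbq("bigquery-public-data.ml_datasets.penguins")

# Prints the index range, per-column dtypes (and non-null counts when
# affordable), plus a fixed-per-dtype memory estimate.
df.info()

# The same estimate as a pandas Series, index entry included, in bytes.
print(df.memory_usage().sum())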

bigframes/dtypes.py

Lines changed: 13 additions & 0 deletions
@@ -143,6 +143,19 @@
 # "string" and "string[pyarrow]" are accepted
 BIGFRAMES_STRING_TO_BIGFRAMES["string[pyarrow]"] = pd.StringDtype(storage="pyarrow")

+# For the purposes of dataframe.memory_usage
+# https://cloud.google.com/bigquery/docs/reference/standard-sql/data-types#data_type_sizes
+DTYPE_BYTE_SIZES = {
+    pd.BooleanDtype(): 1,
+    pd.Int64Dtype(): 8,
+    pd.Float64Dtype(): 8,
+    pd.StringDtype(): 8,
+    pd.ArrowDtype(pa.time64("us")): 8,
+    pd.ArrowDtype(pa.timestamp("us")): 8,
+    pd.ArrowDtype(pa.timestamp("us", tz="UTC")): 8,
+    pd.ArrowDtype(pa.date32()): 8,
+}
+

 def ibis_dtype_to_bigframes_dtype(
     ibis_dtype: ibis_dtypes.DataType,
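
The estimate is purely shape-based: bytes per value times row count, with an 8-byte (pointer-sized) fallback for dtypes not in the table, mirroring pandas' shallow accounting. For example:

import pandas as pd

import bigframes.dtypes

# An INT64 column of 1,000 rows is costed at 8 bytes per value;
# unlisted dtypes also fall back to 8 bytes.
n_rows = 1_000
per_value = bigframes.dtypes.DTYPE_BYTE_SIZES.get(pd.Int64Dtype(), 8)
print(per_value * n_rows)  # 8000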

bigframes/remote_function.py

Lines changed: 13 additions & 6 deletions
@@ -188,6 +188,7 @@ def create_bq_remote_function(
         # https://cloud.google.com/bigquery/docs/reference/standard-sql/remote-functions#create_a_remote_function_2
         bq_function_args = []
         bq_function_return_type = BigQueryType.from_ibis(output_type)
+
         # We are expecting the input type annotations to be 1:1 with the input args
         for idx, name in enumerate(input_args):
             bq_function_args.append(

@@ -204,14 +205,22 @@

         logger.info(f"Creating BQ remote function: {create_function_ddl}")

-        # Make sure the dataset exists
+        # Make sure the dataset exists. I.e. if it doesn't exist, go ahead and
+        # create it
         dataset = bigquery.Dataset(
             bigquery.DatasetReference.from_string(
                 self._bq_dataset, default_project=self._gcp_project_id
            )
        )
        dataset.location = self._bq_location
-        self._bq_client.create_dataset(dataset, exists_ok=True)
+        try:
+            # This check does not require bigquery.datasets.create IAM
+            # permission. So, if the data set already exists, then user can work
+            # without having that permission.
+            self._bq_client.get_dataset(dataset)
+        except google.api_core.exceptions.NotFound:
+            # This requires bigquery.datasets.create IAM permission
+            self._bq_client.create_dataset(dataset, exists_ok=True)

         # TODO: Use session._start_query() so we get progress bar
         query_job = self._bq_client.query(create_function_ddl)  # Make an API request.

@@ -610,7 +619,7 @@ def get_routine_reference(
         raise DatasetMissingError

     dataset_ref = bigquery.DatasetReference(
-        bigquery_client.project, session._session_dataset_id
+        bigquery_client.project, session._anonymous_dataset.dataset_id
     )
     return dataset_ref.routine(routine_ref_str)

@@ -778,9 +787,7 @@ def remote_function(
             dataset, default_project=bigquery_client.project
         )
     else:
-        dataset_ref = bigquery.DatasetReference.from_string(
-            session._session_dataset_id, default_project=bigquery_client.project
-        )
+        dataset_ref = session._anonymous_dataset

     bq_location, cloud_function_region = get_remote_function_locations(
         bigquery_client.location
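
The get-before-create pattern generalizes beyond remote functions. A standalone sketch with placeholder IDs (my-project.my_dataset is not from this commit):

import google.api_core.exceptions
from google.cloud import bigquery

client = bigquery.Client()
dataset = bigquery.Dataset("my-project.my_dataset")  # placeholder IDs
dataset.location = "US"

try:
    # Succeeds with read-only access when the dataset already exists;
    # no bigquery.datasets.create permission is exercised.
    client.get_dataset(dataset.reference)
except google.api_core.exceptions.NotFound:
    # Only creation requires bigquery.datasets.create.
    client.create_dataset(dataset, exists_ok=True)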

bigframes/session/__init__.py

Lines changed: 0 additions & 14 deletions
@@ -198,13 +198,6 @@ def cloudfunctionsclient(self):
     def resourcemanagerclient(self):
         return self._clients_provider.resourcemanagerclient

-    @property
-    def _session_dataset_id(self):
-        """A dataset for storing temporary objects local to the session
-        This is a workaround for remote functions that do not
-        yet support session-temporary instances."""
-        return self._session_dataset.dataset_id
-
     @property
     def _project(self):
         return self.bqclient.project

@@ -229,13 +222,6 @@ def _create_bq_datasets(self):
             query_destination.dataset_id,
         )

-        # Dataset for storing remote functions, which don't yet
-        # support proper session temporary storage yet
-        self._session_dataset = bigquery.Dataset(
-            f"{self.bqclient.project}.bigframes_temp_{self._location.lower().replace('-', '_')}"
-        )
-        self._session_dataset.location = self._location
-
     def close(self):
         """No-op. Temporary resources are deleted after 7 days."""
