OpenProteinAI
diff --git a/‎openprotein/__init__.py‎
Lines changed: 47 additions & 3 deletions b/‎openprotein/__init__.py‎
Lines changed: 47 additions & 3 deletions
diff --git a/‎openprotein/align/align.py‎
Lines changed: 2 additions & 2 deletions b/‎openprotein/align/align.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎openprotein/base.py‎
Lines changed: 23 additions & 32 deletions b/‎openprotein/base.py‎
Lines changed: 23 additions & 32 deletions
diff --git a/‎openprotein/chains.py‎
Lines changed: 9 additions & 85 deletions b/‎openprotein/chains.py‎
Lines changed: 9 additions & 85 deletions
diff --git a/‎openprotein/common/model_metadata.py‎
Lines changed: 4 additions & 4 deletions b/‎openprotein/common/model_metadata.py‎
Lines changed: 4 additions & 4 deletions
diff --git a/‎openprotein/common/reduction.py‎
Lines changed: 8 additions & 0 deletions b/‎openprotein/common/reduction.py‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎openprotein/common/residue_contants.py‎
Lines changed: 75 additions & 0 deletions b/‎openprotein/common/residue_contants.py‎
Lines changed: 75 additions & 0 deletions
@@ -6,8 +6,8 @@
 isort:skip_file
 """
 
-from typing import TYPE_CHECKING
-import warnings
+import os
+from pathlib import Path
 
 from openprotein._version import __version__
 from openprotein.data import DataAPI
@@ -161,4 +161,48 @@ def models(self) -> "ModelsAPI":
         return self._models
 
 
-connect = OpenProtein
+def connect(
+    username: str | None = None,
+    password: str | None = None,
+    backend: str | None = None,
+    timeout: int = 180,
+) -> OpenProtein:
+    """
+    Connect and create a :py:class:`OpenProtein` session.
+
+    Parameters
+    ----------
+    username : str, optional
+        The username of the user. Falls back to ``OPENPROTEIN_USERNAME``,
+        then to ``username`` in ``~/.openprotein/config.toml``.
+    password : str, optional
+        The password of the user. Falls back to ``OPENPROTEIN_PASSWORD``,
+        then to ``password`` in ``~/.openprotein/config.toml``.
+    backend : str, optional
+        The API base URL. Falls back to ``OPENPROTEIN_API_BACKEND``, then
+        ``backend`` in the config file, then ``https://api.openprotein.ai/api/``.
+    timeout : int, optional
+        Passed through unchanged to :py:class:`OpenProtein`.
+
+    Examples
+    --------
+    >>> session = openprotein.connect("username", "password")
+    """
+    config_path = Path.home() / ".openprotein/config.toml"
+    file_config = {}
+    if config_path.exists():
+        import tomli
+
+        with open(config_path, "rb") as f:
+            file_config = tomli.load(f)
+    # Fall through with ``or`` instead of ``str(...)``: a missing value must
+    # stay falsy, otherwise the literal "None" is passed on as a credential.
+    env_username = os.getenv("OPENPROTEIN_USERNAME") or file_config.get("username")
+    env_password = os.getenv("OPENPROTEIN_PASSWORD") or file_config.get("password")
+    default_backend = file_config.get("backend", "https://api.openprotein.ai/api/")
+    return OpenProtein(
+        username=username or env_username,
+        password=password or env_password,
+        backend=backend or os.getenv("OPENPROTEIN_API_BACKEND", default_backend),
+        timeout=timeout,
+    )
@@ -7,7 +7,7 @@
 from openprotein.base import APISession
 from openprotein.errors import DeprecationError
 from openprotein.jobs import Job
-from openprotein.protein import Protein
+from openprotein.molecules import Protein
 
 from . import api
 from .msa import MSAFuture
@@ -285,7 +285,7 @@ def upload_msa(self, msa_file: BinaryIO) -> MSAFuture:
             session=self.session, job=api.msa_post(self.session, msa_file=msa_file)
         )
 
-    def create_msa(self, seed: bytes) -> MSAFuture:
+    def create_msa(self, seed: bytes | str) -> MSAFuture:
         """
         Construct an MSA via homology search with the seed sequence.
 
 
@@ -1,8 +1,6 @@
-import os
 import sys
 import warnings
-from collections.abc import Container, Mapping
-from typing import Union
+from typing import Mapping, Sequence
 from urllib.parse import urljoin
 
 import requests
@@ -13,10 +11,6 @@
 import openprotein.config as config
 from openprotein.errors import APIError, AuthError, HTTPError
 
-USERNAME = os.getenv("OPENPROTEIN_USERNAME")
-PASSWORD = os.getenv("OPENPROTEIN_PASSWORD")
-BACKEND = os.getenv("OPENPROTEIN_API_BACKEND", "https://api.openprotein.ai/api/")
-
 
 class BearerAuth(requests.auth.AuthBase):
     """
@@ -34,29 +28,18 @@ def __call__(self, r):
 class APISession(requests.Session):
     """
     A class to handle API sessions. This class provides a connection session to the OpenProtein API.
-
-    Parameters
-    ----------
-    username : str
-        The username of the user.
-    password : str
-        The password of the user.
-
-    Examples
-    --------
-    >>> session = APISession("username", "password")
     """
 
     def __init__(
         self,
-        username: str | None = USERNAME,
-        password: str | None = PASSWORD,
-        backend: str = BACKEND,
+        username: str,
+        password: str,
+        backend: str,
         timeout: int = 180,
     ):
         if not username or not password:
             raise AuthError(
-                "Expected username and password. Or use environment variables `OPENPROTEIN_USERNAME` and `OPENPROTEIN_PASSWORD`"
+                "Expected username and password. Or use environment variables `OPENPROTEIN_USERNAME` and `OPENPROTEIN_PASSWORD`. Or provide these variables (`username` and `password`) in ~/.openprotein/config.toml."
             )
         super().__init__()
         self.backend = backend
@@ -132,7 +115,7 @@ def request(self, method: str, url: str, *args, **kwargs):
         response = super().request(method, full_url, *args, **kwargs)
 
         if (js := kwargs.get("json")) and js is not None:
-            if total_size(js) > 1e6:
+            if _total_size(js) > 1e6:
                 warnings.warn(
                     "The requested payload is >1MB. There might be some delays or issues in processing. If the request fails, please try again with smaller sizes."
                 )
@@ -153,7 +136,7 @@ def request(self, method: str, url: str, *args, **kwargs):
         return response
 
 
-def total_size(o, seen=None):
+def _total_size(o: Sequence | Mapping, seen=None):
     """Recursively finds size of objects including contents."""
     if seen is None:
         seen = set()
@@ -163,19 +146,27 @@ def total_size(o, seen=None):
     seen.add(obj_id)
     size = sys.getsizeof(o)
     if isinstance(o, dict):
-        size += sum((total_size(k, seen) + total_size(v, seen)) for k, v in o.items())
+        size += sum((_total_size(k, seen) + _total_size(v, seen)) for k, v in o.items())
     elif isinstance(o, (list, tuple, set, frozenset)):
-        size += sum(total_size(i, seen) for i in o)
+        size += sum(_total_size(i, seen) for i in o)
     return size
 
 
-class RestEndpoint:
-    pass
-
-
 class TimeoutError(requests.exceptions.HTTPError):
-    pass
+    """
+    An HTTP error raised due to a timeout, possibly from overly large
+    requests. Shadows the builtin ``TimeoutError`` inside this module.
+    """
 
 
 class CloudFrontError(requests.exceptions.HTTPError):
-    pass
+    """
+    An HTTP error raised due to CloudFront.
+
+    This is usually due to the strict timeout from CloudFront:
+    AWS CloudFront requires responses to return within 2 minutes.
+    That can be prohibitive for our system, which tends to deal
+    with large data. It is usually safe to just ignore/retry upon
+    hitting this error. Our system will scale up and still handle
+    the job.
+    """
@@ -1,88 +1,12 @@
-"""Additional chains that can be used with OpenProtein."""
+import warnings
 
-from dataclasses import dataclass
+warnings.warn(
+    "openprotein.chains is deprecated and will be removed in v0.11. "
+    "Use `from openprotein.molecules import DNA, RNA, Ligand` instead.",
+    FutureWarning,
+    stacklevel=2,
+)
 
+from openprotein.molecules import DNA, RNA, Ligand
 
-@dataclass
-class DNA:
-    """
-    Represents a DNA sequence.
-
-    Attributes:
-        sequence (str): The nucleotide sequence of the DNA.
-    """
-
-    sequence: str
-    chain_id: str | list[str] | None = None
-    cyclic: bool = False
-
-    def __init__(
-        self,
-        sequence: str,
-        chain_id: str | list[str] | None = None,
-        cyclic: bool = False,
-    ):
-        # validate the sequence matches DNA
-        if not all(nt in set("ACGT") for nt in sequence.upper()):
-            raise ValueError("Sequence contains invalid DNA nucleotides.")
-        self.sequence = sequence
-        self.chain_id = chain_id
-        self.cyclic = cyclic
-
-
-@dataclass
-class RNA:
-    """
-    Represents an RNA sequence.
-
-    Attributes:
-        sequence (str): The nucleotide sequence of the RNA.
-    """
-
-    sequence: str
-    chain_id: str | list[str] | None = None
-    cyclic: bool = False
-
-    def __init__(
-        self,
-        sequence: str,
-        chain_id: str | list[str] | None = None,
-        cyclic: bool = False,
-    ):
-        # validate the sequence matches RNA
-        if not all(nt in set("ACGU") for nt in sequence.upper()):
-            raise ValueError("Sequence contains invalid RNA nucleotides.")
-        self.sequence = sequence
-        self.chain_id = chain_id
-        self.cyclic = cyclic
-
-
-@dataclass
-class Ligand:
-    """
-    Represents a ligand with optional Chemical Component Dictionary (CCD) identifier and SMILES string.
-
-    Requires either a CCD identifier or SMILES string.
-
-    Attributes:
-        ccd (str | None): The CCD identifier for the ligand.
-        smiles (str | None): The SMILES representation of the ligand.
-    """
-
-    chain_id: str | list[str] | None = None
-    ccd: str | None = None
-    smiles: str | None = None
-
-    def __init__(
-        self,
-        *,
-        chain_id: str | list[str] | None = None,
-        ccd: str | None = None,
-        smiles: str | None = None,
-    ):
-        self.chain_id = chain_id
-        if (ccd is None and smiles is None) or (ccd is not None and smiles is not None):
-            raise ValueError("Exactly one of 'ccd' or 'smiles' must be provided.")
-        # TODO add validation
-        self.ccd = ccd
-        self.smiles = smiles
+__all__ = ["DNA", "RNA", "Ligand"]
@@ -26,8 +26,8 @@ class ModelMetadata(BaseModel):
     id: str = Field(..., alias="model_id")
     description: ModelDescription
     max_sequence_length: int | None = None
-    dimension: int
-    output_types: list[str]
-    input_tokens: list[str] | None
+    dimension: int | None = None
+    output_types: list[str] | None = None
+    input_tokens: list[str] | None = None
     output_tokens: list[str] | None = None
-    token_descriptions: list[list[TokenInfo]]
+    token_descriptions: list[list[TokenInfo]] | None = None
@@ -5,6 +5,14 @@
 
 
 class ReductionType(str, Enum):
+    """
+    ReductionType is an enumeration of the possible reduction types available.
+
+    Attributes:
+        MEAN : Mean reduction takes the mean of the embeddings across the sequence length dimension.
+        SUM : Sum reduction takes the sum of the embeddings across the sequence length dimension.
+    """
+
     MEAN = "MEAN"
     SUM = "SUM"
 
 
@@ -0,0 +1,75 @@
+# fmt: off
+residue_atoms = {
+    "ALA": ["C", "CA", "CB", "N", "O"],
+    "ARG": ["C", "CA", "CB", "CG", "CD", "CZ", "N", "NE", "O", "NH1", "NH2"],
+    "ASP": ["C", "CA", "CB", "CG", "N", "O", "OD1", "OD2"],
+    "ASN": ["C", "CA", "CB", "CG", "N", "ND2", "O", "OD1"],
+    "CYS": ["C", "CA", "CB", "N", "O", "SG"],
+    "GLU": ["C", "CA", "CB", "CG", "CD", "N", "O", "OE1", "OE2"],
+    "GLN": ["C", "CA", "CB", "CG", "CD", "N", "NE2", "O", "OE1"],
+    "GLY": ["C", "CA", "N", "O"],
+    "HIS": ["C", "CA", "CB", "CG", "CD2", "CE1", "N", "ND1", "NE2", "O"],
+    "ILE": ["C", "CA", "CB", "CG1", "CG2", "CD1", "N", "O"],
+    "LEU": ["C", "CA", "CB", "CG", "CD1", "CD2", "N", "O"],
+    "LYS": ["C", "CA", "CB", "CG", "CD", "CE", "N", "NZ", "O"],
+    "MET": ["C", "CA", "CB", "CG", "CE", "N", "O", "SD"],
+    "PHE": ["C", "CA", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "N", "O"],
+    "PRO": ["C", "CA", "CB", "CG", "CD", "N", "O"],
+    "SER": ["C", "CA", "CB", "N", "O", "OG"],
+    "THR": ["C", "CA", "CB", "CG2", "N", "O", "OG1"],
+    "TRP": [
+        "C",
+        "CA",
+        "CB",
+        "CG",
+        "CD1",
+        "CD2",
+        "CE2",
+        "CE3",
+        "CZ2",
+        "CZ3",
+        "CH2",
+        "N",
+        "NE1",
+        "O",
+    ],
+    "TYR": [
+        "C",
+        "CA",
+        "CB",
+        "CG",
+        "CD1",
+        "CD2",
+        "CE1",
+        "CE2",
+        "CZ",
+        "N",
+        "O",
+        "OH",
+    ],
+    "VAL": ["C", "CA", "CB", "CG1", "CG2", "N", "O"],
+}
+
+restype_1to3 = {
+    "A": "ALA",
+    "R": "ARG",
+    "N": "ASN",
+    "D": "ASP",
+    "C": "CYS",
+    "Q": "GLN",
+    "E": "GLU",
+    "G": "GLY",
+    "H": "HIS",
+    "I": "ILE",
+    "L": "LEU",
+    "K": "LYS",
+    "M": "MET",
+    "F": "PHE",
+    "P": "PRO",
+    "S": "SER",
+    "T": "THR",
+    "W": "TRP",
+    "Y": "TYR",
+    "V": "VAL",
+}
+# fmt: on