diff --git a/src/dolphin/_types.py b/src/dolphin/_types.py index 6cbe2596..f8dd08a2 100644 --- a/src/dolphin/_types.py +++ b/src/dolphin/_types.py @@ -4,7 +4,14 @@ import sys from enum import Enum from os import PathLike -from typing import TYPE_CHECKING, NamedTuple, TypeVar, Union +from typing import ( + TYPE_CHECKING, + NamedTuple, + Protocol, + TypeVar, + Union, + runtime_checkable, +) if sys.version_info >= (3, 10): from typing import ParamSpec @@ -22,12 +29,6 @@ PathLikeStr = PathLike -PathOrStr = Union[str, PathLikeStr] -Filename = PathOrStr # May add a deprecation notice for `Filename` -# TypeVar added for generic functions which should return the same type as the input -PathLikeT = TypeVar("PathLikeT", str, PathLikeStr) - - class Bbox(NamedTuple): """Bounding box named tuple, defining extent in cartesian coordinates. @@ -104,3 +105,31 @@ class TropoType(str, Enum): """Hydrostatic (same as dry, named differently in raider)""" COMB = "comb" """Combined wet + dry delay.""" + + +@runtime_checkable +class GeneralPath(Protocol): + """A protocol to handle paths that can be either local or S3 paths.""" + + def parent(self): ... + + def suffix(self): ... + + def resolve(self): ... + + def exists(self): ... + + def read_text(self): ... + + def __truediv__(self, other): ... + + def __str__(self) -> str: ... 
+ + def __fspath__(self) -> str: + return str(self) + + +PathOrStr = Union[str, PathLikeStr, GeneralPath] +Filename = PathOrStr # May add a deprecation notice for `Filename` +# TypeVar added for generic functions which should return the same type as the input +PathLikeT = TypeVar("PathLikeT", str, PathLikeStr, GeneralPath) diff --git a/src/dolphin/io/__init__.py b/src/dolphin/io/__init__.py index 3f6ffa4f..556c14a0 100644 --- a/src/dolphin/io/__init__.py +++ b/src/dolphin/io/__init__.py @@ -1,6 +1,7 @@ from ._background import * from ._blocks import * from ._core import * +from ._paths import * from ._process import * from ._readers import * from ._writers import * diff --git a/src/dolphin/io/_paths.py b/src/dolphin/io/_paths.py new file mode 100644 index 00000000..e9f9c064 --- /dev/null +++ b/src/dolphin/io/_paths.py @@ -0,0 +1,197 @@ +from __future__ import annotations + +import copy +import logging +import re +from pathlib import Path +from typing import Union +from urllib.parse import ParseResult, urlparse + +from dolphin._types import GeneralPath + +__all__ = ["S3Path"] + + +logger = logging.getLogger(__name__) + + +class S3Path(GeneralPath): + """A convenience class to handle paths on S3. + + This class relies on `pathlib.Path` for operations using `urllib` to parse the url. + + If passing a url with a trailing slash, the slash will be preserved + when converting back to string. + + Note that pure path manipulation functions do *not* require `boto3`, + but functions which interact with S3 (e.g. `exists()`, `.read_text()`) do. 
+ + Attributes + ---------- + bucket : str + Name of bucket in the url + path : pathlib.Path + The URL path after s3:/// + key : str + Alias of `path` converted to a string + + Examples + -------- + >>> from dolphin.io import S3Path + >>> s3_path = S3Path("s3://bucket/path/to/file.txt") + >>> str(s3_path) + 's3://bucket/path/to/file.txt' + >>> s3_path.parent + S3Path("s3://bucket/path/to/") + >>> str(s3_path.parent) + 's3://bucket/path/to/' + + """ + + def __init__(self, s3_url: Union[str, "S3Path"], unsigned: bool = False): + """Create an S3Path. + + Parameters + ---------- + s3_url : str or S3Path + The S3 url to parse. + unsigned : bool, optional + If True, disable signing requests to S3. + + """ + # Names come from the urllib.parse.ParseResult + if isinstance(s3_url, S3Path): + self._scheme: str = s3_url._scheme + self._netloc: str = s3_url._netloc + self.bucket: str = s3_url.bucket + self.path: Path = s3_url.path + self._trailing_slash: str = s3_url._trailing_slash + else: + parsed: ParseResult = urlparse(s3_url) + self._scheme = parsed.scheme + self._netloc = self.bucket = parsed.netloc + self._parsed = parsed + self.path = Path(parsed.path) + self._trailing_slash = "/" if s3_url.endswith("/") else "" + + if self._scheme != "s3": + raise ValueError(f"{s3_url} is not an S3 url") + + self._unsigned = unsigned + + @classmethod + def from_bucket_key(cls, bucket: str, key: str): + """Create a `S3Path` from the bucket name and key/prefix. + + Matches API of some Boto3 functions which use this format. + + Parameters + ---------- + bucket : str + Name of S3 bucket. + key : str + S3 url of path after the bucket. 
+
+        """
+        return cls(f"s3://{bucket}/{key}")
+
+    def get_path(self):
+        # For S3 paths, we need to add the double slash and netloc back to the front
+        return f"{self._scheme}://{self._netloc}{self.path.as_posix()}{self._trailing_slash}"
+
+    @property
+    def key(self) -> str:
+        """Name of key/prefix within the bucket with leading slash removed."""
+        return f"{str(self.path.as_posix()).lstrip('/')}{self._trailing_slash}"
+
+    @property
+    def parent(self):
+        parent_path = self.path.parent
+        # Since this is a parent, it will always end in a slash
+        if self._scheme == "s3":
+            # For S3 paths, we need to add the scheme and netloc back to the front
+            return S3Path(f"{self._scheme}://{self._netloc}{parent_path.as_posix()}/")
+        else:
+            # For local paths, we can just convert the path to a string
+            return S3Path(str(parent_path) + "/")
+
+    @property
+    def suffix(self):
+        return self.path.suffix
+
+    def resolve(self) -> S3Path:
+        """Resolve the path to an absolute path - S3 paths are always absolute."""
+        return self
+
+    def _get_client(self):
+        import boto3
+        from botocore import UNSIGNED
+        from botocore.config import Config
+
+        if self._unsigned:
+            return boto3.client("s3", config=Config(signature_version=UNSIGNED))
+        else:
+            return boto3.client("s3")
+
+    def exists(self) -> bool:
+        """Whether this path exists on S3."""
+        client = self._get_client()
+        resp = client.list_objects_v2(
+            Bucket=self.bucket,
+            Prefix=self.key,
+            MaxKeys=1,
+        )
+        return resp.get("KeyCount") == 1
+
+    def read_text(self) -> str:
+        """Download/read the S3 file as text."""
+        return self._download_as_bytes().decode()
+
+    def read_bytes(self) -> bytes:
+        """Download/read the S3 file as bytes."""
+        return self._download_as_bytes()
+
+    def _download_as_bytes(self) -> bytes:
+        """Download file to a `BytesIO` buffer to read as bytes."""
+        from io import BytesIO
+
+        client = self._get_client()
+
+        bio = BytesIO()
+        client.download_fileobj(self.bucket, self.key, bio)
+        bio.seek(0)
+        out = bio.read()
+
bio.close() + return out + + def __truediv__(self, other): + new = copy.deepcopy(self) + new.path = self.path / other + new._trailing_slash = "/" if str(other).endswith("/") else "" + return new + + def __eq__(self, other): + if isinstance(other, S3Path): + return self.get_path() == other.get_path() + elif isinstance(other, str): + return self.get_path() == other + else: + return False + + def __repr__(self): + return f'S3Path("{self.get_path()}")' + + def __str__(self): + return self.get_path() + + def to_gdal(self): + """Convert this S3Path to a GDAL URL.""" + return f"/vsis3/{self.bucket}/{self.key}" + + +def fix_s3_url(url): + """Fix an S3 URL that has been altered by pathlib. + + Will replace s3:/my-bucket/... with s3://my-bucket/... + """ + return re.sub(r"s3:/((?!/).*)", r"s3://\1", str(url)) diff --git a/src/dolphin/io/_readers.py b/src/dolphin/io/_readers.py index 691a2aec..29fb0c49 100644 --- a/src/dolphin/io/_readers.py +++ b/src/dolphin/io/_readers.py @@ -28,6 +28,7 @@ from dolphin.io._blocks import iter_blocks from ._background import _DEFAULT_TIMEOUT, BackgroundReader +from ._paths import S3Path from ._utils import _ensure_slices, _unpack_3d_slices logger = logging.getLogger(__name__) @@ -715,7 +716,10 @@ def __init__( # files: list[Filename] = [Path(f) for f in file_list] self._use_abs_path = use_abs_path - if use_abs_path: + files: list[Filename | S3Path] + if any(str(f).startswith("s3://") for f in file_list): + files = [S3Path(str(f)) for f in file_list] + elif use_abs_path: files = [utils._resolve_gdal_path(p) for p in file_list] else: files = list(file_list) @@ -808,12 +812,18 @@ def _write(self): ds = None @property - def _gdal_file_strings(self): + def _gdal_file_strings(self) -> list[str]: """Get the GDAL-compatible paths to write to the VRT. If we're not using .h5 or .nc, this will just be the file_list as is. 
""" - return [io.format_nc_filename(f, self.subdataset) for f in self.file_list] + out = [] + for f in self.file_list: + if isinstance(f, S3Path): + out.append(f.to_gdal()) + else: + out.append(io.format_nc_filename(f, self.subdataset)) + return out def __fspath__(self): # Allows os.fspath() to work on the object, enabling rasterio.open() diff --git a/src/dolphin/stack.py b/src/dolphin/stack.py index c7778ee1..0f9b58fa 100755 --- a/src/dolphin/stack.py +++ b/src/dolphin/stack.py @@ -51,6 +51,12 @@ class BaseStack(BaseModel): description="Index of the SLC to use as reference during phase linking", ) + model_config = { + # For the `Filename, so it can handle the `GeneralPath` protocol` + # https://github.com/pydantic/pydantic/discussions/5767 + "arbitrary_types_allowed": True + } + @field_validator("dates", mode="before") @classmethod def _check_if_not_tuples(cls, v): @@ -190,6 +196,12 @@ class CompressedSlcInfo(BaseModel): description="Folder/location where ministack will write outputs to.", ) + model_config = { + # For the `Filename, so it can handle the `GeneralPath` protocol` + # https://github.com/pydantic/pydantic/discussions/5767 + "arbitrary_types_allowed": True + } + @field_validator("real_slc_dates", mode="before") @classmethod def _untuple_dates(cls, v): diff --git a/tests/cassettes/test_io_paths/TestS3Path.test_exists.yaml b/tests/cassettes/test_io_paths/TestS3Path.test_exists.yaml new file mode 100644 index 00000000..0f77e375 --- /dev/null +++ b/tests/cassettes/test_io_paths/TestS3Path.test_exists.yaml @@ -0,0 +1,158 @@ +interactions: +- request: + body: null + headers: + User-Agent: + - !!binary | + Qm90bzMvMS4zNC40NyBtZC9Cb3RvY29yZSMxLjM0LjQ3IHVhLzIuMCBvcy9saW51eCM2LjUuMC0z + NS1nZW5lcmljIG1kL2FyY2gjeDg2XzY0IGxhbmcvcHl0aG9uIzMuMTEuOCBtZC9weWltcGwjQ1B5 + dGhvbiBjZmcvcmV0cnktbW9kZSNsZWdhY3kgQm90b2NvcmUvMS4zNC40Nw== + amz-sdk-invocation-id: + - !!binary | + YjNhNGRmNDUtZTlhYy00NzZiLTg4MTEtOWJmYTQyY2ExMjMy + amz-sdk-request: + - !!binary | + YXR0ZW1wdD0x + 
method: GET + uri: https://testzarr-insar-plotting.s3.amazonaws.com/?list-type=2&prefix=unwrapped%2F20231009_20231021.unw.conncomp.tif&max-keys=1&encoding-type=url + response: + body: + string: ' + + testzarr-insar-plottingunwrapped/20231009_20231021.unw.conncomp.tif11urlfalseunwrapped/20231009_20231021.unw.conncomp.tif2024-05-24T23:52:56.000Z"adce42fcf6d326752742058b7b2aecbc"164048STANDARD' + headers: + Content-Type: + - application/xml + Date: + - Wed, 29 May 2024 02:00:38 GMT + Server: + - AmazonS3 + Transfer-Encoding: + - chunked + x-amz-bucket-region: + - us-east-1 + x-amz-id-2: + - I+1hhj54h/ima016k/ya9yYR0XJOsD2oATzFjbtf6ab4cuUJwSqJdqVK08JMk79lxSLoQkf1POo= + x-amz-request-id: + - FX7A9RE5440Z9J6B + status: + code: 200 + message: OK +- request: + body: null + headers: + User-Agent: + - !!binary | + Qm90bzMvMS4zNC40NyBtZC9Cb3RvY29yZSMxLjM0LjQ3IHVhLzIuMCBvcy9saW51eCM2LjUuMC0z + NS1nZW5lcmljIG1kL2FyY2gjeDg2XzY0IGxhbmcvcHl0aG9uIzMuMTEuOCBtZC9weWltcGwjQ1B5 + dGhvbiBjZmcvcmV0cnktbW9kZSNsZWdhY3kgQm90b2NvcmUvMS4zNC40Nw== + amz-sdk-invocation-id: + - !!binary | + MDE4Zjk0YjktNmM1ZC00YmI5LThiOTgtM2ZiMjUwYmMxMWMz + amz-sdk-request: + - !!binary | + YXR0ZW1wdD0x + method: GET + uri: https://testzarr-insar-plotting.s3.amazonaws.com/?list-type=2&prefix=unwrapped%2F20231009_20231021.unw.conncomp.tif&max-keys=1&encoding-type=url + response: + body: + string: ' + + testzarr-insar-plottingunwrapped/20231009_20231021.unw.conncomp.tif11urlfalseunwrapped/20231009_20231021.unw.conncomp.tif2024-05-24T23:52:56.000Z"adce42fcf6d326752742058b7b2aecbc"164048STANDARD' + headers: + Content-Type: + - application/xml + Date: + - Wed, 29 May 2024 02:01:08 GMT + Server: + - AmazonS3 + Transfer-Encoding: + - chunked + x-amz-bucket-region: + - us-east-1 + x-amz-id-2: + - D8PcHCLLpOSjfHH8uQALPBcY5xIisRBkqpqZCvjcs3xx9NmErpAb+Riw5xuKPgTEEueQ60spoEU= + x-amz-request-id: + - RA11A3SEKXNH6NQP + status: + code: 200 + message: OK +- request: + body: null + headers: + User-Agent: + - !!binary | + 
Qm90bzMvMS4zNC40NyBtZC9Cb3RvY29yZSMxLjM0LjQ3IHVhLzIuMCBvcy9saW51eCM2LjUuMC0z + NS1nZW5lcmljIG1kL2FyY2gjeDg2XzY0IGxhbmcvcHl0aG9uIzMuMTEuOCBtZC9weWltcGwjQ1B5 + dGhvbiBjZmcvcmV0cnktbW9kZSNsZWdhY3kgQm90b2NvcmUvMS4zNC40Nw== + amz-sdk-invocation-id: + - !!binary | + MDUyMGVhM2EtM2E3Yy00YTUxLWIzZmYtNjM2MDI4MzE1OWRi + amz-sdk-request: + - !!binary | + YXR0ZW1wdD0x + method: GET + uri: https://testzarr-insar-plotting.s3.amazonaws.com/?list-type=2&prefix=unwrapped%2F20231009_20231021.unw.conncomp.tif&max-keys=1&encoding-type=url + response: + body: + string: ' + + testzarr-insar-plottingunwrapped/20231009_20231021.unw.conncomp.tif11urlfalseunwrapped/20231009_20231021.unw.conncomp.tif2024-05-24T23:52:56.000Z"adce42fcf6d326752742058b7b2aecbc"164048STANDARD' + headers: + Content-Type: + - application/xml + Date: + - Wed, 29 May 2024 02:01:39 GMT + Server: + - AmazonS3 + Transfer-Encoding: + - chunked + x-amz-bucket-region: + - us-east-1 + x-amz-id-2: + - K3NHxhYBRIPobaDhovrJhaNJoqIaRbq/EApi+uXirn5Zn4m/wHExmQkqAWZZnb1zrpLDEcUQ5epvdGaglaZ37w== + x-amz-request-id: + - 1FJWNPT4ZKHXBRQH + status: + code: 200 + message: OK +- request: + body: null + headers: + User-Agent: + - !!binary | + Qm90bzMvMS4zNC40NyBtZC9Cb3RvY29yZSMxLjM0LjQ3IHVhLzIuMCBvcy9saW51eCM2LjUuMC0z + NS1nZW5lcmljIG1kL2FyY2gjeDg2XzY0IGxhbmcvcHl0aG9uIzMuMTEuOCBtZC9weWltcGwjQ1B5 + dGhvbiBjZmcvcmV0cnktbW9kZSNsZWdhY3kgQm90b2NvcmUvMS4zNC40Nw== + amz-sdk-invocation-id: + - !!binary | + OGMzOGFhYmMtODBhZC00ZDJmLWEyOGYtYzkxODM5ZGI2MmE5 + amz-sdk-request: + - !!binary | + YXR0ZW1wdD0x + method: GET + uri: https://testzarr-insar-plotting.s3.amazonaws.com/?list-type=2&prefix=unwrapped%2F20231009_20231021.unw.conncomp.tif&max-keys=1&encoding-type=url + response: + body: + string: ' + + testzarr-insar-plottingunwrapped/20231009_20231021.unw.conncomp.tif11urlfalseunwrapped/20231009_20231021.unw.conncomp.tif2024-05-24T23:52:56.000Z"adce42fcf6d326752742058b7b2aecbc"164048STANDARD' + headers: + Content-Type: + - application/xml + Date: 
+ - Wed, 29 May 2024 02:02:15 GMT + Server: + - AmazonS3 + Transfer-Encoding: + - chunked + x-amz-bucket-region: + - us-east-1 + x-amz-id-2: + - Q/TvGo71s143RJF5Cx9AU7zliXZ99yAPL1xxvnwNGJpmD4hwqqPtuNvSKEgeu94dXsUd6txQKTs= + x-amz-request-id: + - 3ENV69283YS24NR7 + status: + code: 200 + message: OK +version: 1 diff --git a/tests/requirements.txt b/tests/requirements.txt index 6874a625..3a15c86f 100644 --- a/tests/requirements.txt +++ b/tests/requirements.txt @@ -1,10 +1,12 @@ asv black +boto3 flake8 pooch pre-commit pytest pytest-cov pytest-randomly # control random seed +pytest-vcr # record and replay HTTP requests pytest-xdist # parallel tests: https://pytest-xdist.readthedocs.io/en/latest/ shapely diff --git a/tests/test_io_paths.py b/tests/test_io_paths.py new file mode 100644 index 00000000..74ea87c1 --- /dev/null +++ b/tests/test_io_paths.py @@ -0,0 +1,49 @@ +import pytest + +from dolphin import io + +URL = "s3://testzarr-insar-plotting/unwrapped/20231009_20231021.unw.conncomp.tif" + + +class TestS3Path: + @pytest.fixture + def s3path(self): + return io.S3Path(URL, unsigned=True) + + @pytest.mark.vcr + def test_exists(self, s3path): + assert s3path.exists() + + def test_s3path_parent(self, s3path): + assert s3path.parent == io.S3Path("s3://testzarr-insar-plotting/unwrapped/") + + def test_s3path_suffix(self, s3path): + assert s3path.suffix == ".tif" + + def test_from_bucket_key(self): + bucket = "testzarr-insar-plotting" + key = "unwrapped/20231009_20231021.unw.conncomp.tif" + s3path = io.S3Path.from_bucket_key(bucket, key) + assert str(s3path) == str(io.S3Path(URL)) + + @pytest.mark.vcr + def test_rasterio_open(self, s3path): + import rasterio as rio + + with rio.Env(AWS_NO_SIGN_REQUEST="YES"): + with rio.open(URL) as src: + driver, shape = src.driver, src.shape + + with rio.open(s3path) as src: + assert src.driver == driver + assert src.shape == shape + + def test_to_gdal_string(self, s3path): + assert s3path.to_gdal() == f"/vsis3/{URL[5:]}" + + 
@pytest.mark.vcr + def test_io_read_gdal(self, s3path): + from osgeo import gdal + + with gdal.config_option("AWS_NO_SIGN_REQUEST", "YES"): + assert io.get_raster_driver(s3path.to_gdal()) == "GTiff"