# Source code for renku.core.dataset.dataset_add

# Copyright Swiss Data Science Center (SDSC). A partnership between
# École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataset add business logic."""

import itertools
import os.path
import shutil
import tempfile
from pathlib import Path
from typing import Dict, Generator, List, Optional, Tuple, Union, cast

from renku.command.command_builder.command import inject
from renku.core import errors
from renku.core.dataset.context import DatasetContext
from renku.core.dataset.datasets_provenance import DatasetsProvenance
from renku.core.dataset.pointer_file import create_external_file
from renku.core.dataset.providers.api import ImporterApi
from renku.core.dataset.providers.factory import ProviderFactory
from renku.core.dataset.providers.local import LocalProvider
from renku.core.dataset.providers.models import DatasetAddAction, DatasetAddMetadata
from renku.core.interface.dataset_gateway import IDatasetGateway
from renku.core.interface.storage import IStorage
from renku.core.storage import check_external_storage, track_paths_in_storage
from renku.core.util import communication, requests
from renku.core.util.git import get_git_user
from renku.core.util.os import get_absolute_path, get_file_size, get_files, get_relative_path, hash_file, is_subpath
from renku.core.util.urls import check_url, is_uri_subfolder, resolve_uri
from renku.core.util.util import parallel_execute
from renku.domain_model.constant import NON_EXISTING_ENTITY_CHECKSUM
from renku.domain_model.dataset import Dataset, DatasetFile, RemoteEntity
from renku.domain_model.project_context import project_context


def add_to_dataset(
    dataset_name: str,
    urls: List[str],
    *,
    importer: Optional[ImporterApi] = None,
    force: bool = False,
    create: bool = False,
    overwrite: bool = False,
    sources: Optional[List[Union[str, Path]]] = None,
    destination: str = "",
    revision: Optional[str] = None,
    extract: bool = False,
    clear_files_before: bool = False,
    total_size: Optional[int] = None,
    datadir: Optional[Path] = None,
    storage: Optional[str] = None,
    **kwargs,
) -> Dataset:
    """Import the data into the data directory.

    Args:
        dataset_name(str): Name of the dataset that files are added to.
        urls(List[str]): Paths/URLs to add. The passed list is not modified.
        importer(Optional[ImporterApi]): Forwarded to ``get_files_metadata`` (Default value = None).
        force(bool): Forwarded to ``get_files_metadata`` and ``filter_files`` (Default value = False).
        create(bool): Passed to ``DatasetContext``; required when ``storage`` is set (Default value = False).
        overwrite(bool): Forwarded to ``filter_files`` (Default value = False).
        sources(Optional[List[Union[str, Path]]]): Forwarded to ``get_files_metadata`` (Default value = None).
        destination(str): Destination inside the dataset's data directory (Default value = "").
        revision(Optional[str]): Forwarded to ``get_files_metadata`` (Default value = None).
        extract(bool): Forwarded to ``get_files_metadata`` (Default value = False).
        clear_files_before(bool): Forwarded to ``update_dataset_metadata`` (Default value = False).
        total_size(Optional[int]): Size used for the free disk space check; if ``None`` it is estimated from the
            URLs (Default value = None).
        datadir(Optional[Path]): Passed to ``DatasetContext`` (Default value = None).
        storage(Optional[str]): Passed to ``DatasetContext``; can only be set together with ``create``
            (Default value = None).
        **kwargs: Forwarded to ``get_files_metadata``.

    Raises:
        errors.ParameterError: If ``storage`` is set without ``create`` or if a path/URL cannot be found.
        errors.UsageError: If ``create`` is set and there is no file to add.
        errors.DatasetNotFound: If the dataset doesn't exist.

    Returns:
        Dataset: The dataset that was used for adding files.
    """
    sources = sources or []
    # NOTE: Work on a copy because files found in the data directory are appended below and the caller's list must
    # not be changed as a side effect
    urls = list(urls)

    if not create and storage:
        raise errors.ParameterError("Storage can be set only when creating a dataset")

    try:
        with DatasetContext(name=dataset_name, create=create, datadir=datadir, storage=storage) as dataset:
            destination_path = _create_destination_directory(dataset, destination)

            check_external_storage()

            # NOTE: This is not required for cloud storages
            if not dataset.storage:
                _check_available_space(urls, total_size=total_size)

            datadir = cast(Path, project_context.path / dataset.get_datadir())
            if create and datadir.exists() and not dataset.storage:
                # NOTE: Add datadir to paths to add missing files on create
                urls.extend(str(file) for file in get_files(datadir))

            files = get_files_metadata(
                urls=urls,
                dataset=dataset,
                importer=importer,
                destination=destination_path,
                revision=revision,
                sources=sources,
                extract=extract,
                force=force,
                **kwargs,
            )

            files = filter_files(dataset=dataset, files=files, force=force, overwrite=overwrite)
            if not files:
                if create:
                    raise errors.UsageError("There are no files to create a dataset")
                else:
                    communication.warn("No new file was added to project")
                    return dataset

            # NOTE: All files at this point can be force-added

            copy_files_to_dataset(dataset=dataset, files=files)
            add_files_to_repository(dataset=dataset, files=files)
            update_dataset_metadata(dataset=dataset, files=files, clear_files_before=clear_files_before)

            DatasetsProvenance().add_or_update(dataset, creator=get_git_user(repository=project_context.repository))
    except errors.DatasetNotFound as e:
        # NOTE: Re-raise with a message that tells the user how to create the missing dataset
        raise errors.DatasetNotFound(
            message="Dataset '{0}' does not exist.\n"
            "Use 'renku dataset create {0}' to create the dataset or retry 'renku dataset add {0}' command "
            "with '--create' option for automatic dataset creation.".format(dataset_name)
        ) from e
    except (FileNotFoundError, errors.GitCommandError) as e:
        raise errors.ParameterError("Could not find paths/URLs: \n{}".format("\n".join(urls))) from e
    else:
        # NOTE: Only reached when no exception was raised and the ``with`` block didn't return early
        project_context.database.commit()
        return dataset


def get_files_metadata(
    *,
    urls: List[str],
    importer: Optional[ImporterApi] = None,
    dataset: Dataset,
    destination: Path,
    extract: bool,
    revision: Optional[str],
    sources: List[Union[str, Path]],
    force: bool = False,
    **kwargs,
) -> List[DatasetAddMetadata]:
    """Collect metadata of the files that should be added to a dataset.

    When an ``importer`` is passed, its ``download_files`` result is returned directly. Otherwise, a provider is
    selected for each URL and the results of its ``get_metadata`` are accumulated in the order of ``urls``.

    Raises:
        errors.ParameterError: If no URL is passed, or ``sources`` is used with multiple URLs or with a non-git URL.
    """
    if importer:
        return importer.download_files(destination=destination, extract=extract)

    if not urls:
        raise errors.ParameterError("No URL is specified")
    if sources and len(urls) > 1:
        raise errors.ParameterError("Cannot use '--source' with multiple URLs.")

    collected = []

    for url in urls:
        is_remote, is_git = check_url(url)

        if sources and not is_git:
            raise errors.ParameterError("Cannot use '-s/--src/--source' with URLs or local files.")

        action = DatasetAddAction.NONE

        if is_remote:
            provider = ProviderFactory.get_add_provider(uri=url)
        else:
            # NOTE: A local path might be inside a mounted dataset/provider; in that case use its remote location
            cloud_dataset, remote_url = get_cloud_dataset_from_path(path=url)
            if not cloud_dataset:
                provider = LocalProvider(uri=url)
            else:
                url = remote_url
                provider = ProviderFactory.get_storage_provider(uri=cloud_dataset.storage)
                # NOTE: Update metadata if destination dataset is the same as source dataset, otherwise copy the file
                # since it's already in the local filesystem
                action = DatasetAddAction.COPY

        collected.extend(
            provider.get_metadata(
                uri=url,
                destination=destination,
                revision=revision,
                sources=sources,
                dataset=dataset,
                extract=extract,
                force=force,
                dataset_add_action=action,
                **kwargs,
            )
        )

    return collected


@inject.autoparams("dataset_gateway")
def has_cloud_storage(dataset_gateway: IDatasetGateway) -> bool:
    """Return if a project has any dataset with cloud storage with its data directory mounted or pulled.

    Args:
        dataset_gateway(IDatasetGateway): Injected gateway that is used to list all active datasets.

    Returns:
        bool: True if at least one active dataset has a storage set and its data directory exists.
    """
    # NOTE: ``exists`` return False for symlinks if their target doesn't exists, but it's fine here since it means the
    # dataset's mounted/pulled location doesn't exist.
    # NOTE: Evaluate the condition itself so that the result doesn't depend on the truthiness of dataset objects
    return any(
        bool(dataset.storage) and (project_context.path / dataset.get_datadir()).exists()
        for dataset in dataset_gateway.get_all_active_datasets()
    )


@inject.autoparams("dataset_gateway")
def get_cloud_dataset_from_path(
    path: Union[Path, str], dataset_gateway: IDatasetGateway, missing_ok: bool = False
) -> Tuple[Optional[Dataset], Optional[str]]:
    """Check the path against datasets' storage and return a dataset (if any).

    Args:
        path(Union[Path, str]): Local path to check.
        dataset_gateway(IDatasetGateway): Injected gateway that is used to list all active datasets.
        missing_ok(bool): If False, a path that doesn't exist never matches a dataset (Default value = False).

    Returns:
        Tuple[Optional[Dataset], Optional[str]]: The matching dataset and the location of ``path`` in its storage, or
            ``(None, None)`` if no dataset with storage contains the path.
    """
    if not has_cloud_storage():
        return None, None

    # NOTE: If path is inside the datadir of a dataset with storage backend and the dataset isn't mounted, we should
    # still add whatever is in the path (because it might have been pulled)

    path = Path(get_absolute_path(path))

    if not missing_ok and not path.exists() and not os.path.lexists(path):
        return None, None

    # NOTE: Resolve ``path`` because ``datadir`` is resolved and resolved paths might have be on a different
    # location (e.g. on macos /tmp resolves to /private/tmp). The result doesn't depend on the dataset, so it's
    # computed once.
    resolved_path = path.resolve()

    for dataset in dataset_gateway.get_all_active_datasets():
        if not dataset.storage:
            continue

        datadir = project_context.path / dataset.get_datadir()
        relative_path = get_relative_path(resolved_path, base=datadir.resolve())

        if is_subpath(path, base=datadir) or relative_path is not None:
            if relative_path is None:
                # NOTE: ``path`` is inside ``datadir`` but its resolved location isn't (e.g. a symlink that points
                # outside); use the unresolved relative path so that the URI doesn't end with a literal "None"
                relative_path = get_relative_path(path, base=datadir)
            if relative_path is None or relative_path == ".":
                relative_path = ""
            storage_uri = dataset.storage.rstrip("/")
            remote_url = f"{storage_uri}/{relative_path}"
            return dataset, remote_url
        elif is_subpath(resolved_path, Path(dataset.storage).resolve()):  # NOTE: For local backend storage
            return dataset, str(resolved_path)

    return None, None


def _check_available_space(urls: List[str], total_size: Optional[int] = None):
    """Verify that the project's device has enough free space for the download.

    If ``total_size`` isn't passed, it is estimated by summing the ``content-length`` header of a HEAD request for
    each remote URL; URLs whose request fails are skipped.

    Raises:
        errors.OperationError: If the required size is larger than the free space.
    """
    if total_size is None:
        total_size = 0
        # NOTE: Lazily pick remote URLs so that each URL is checked right before it is requested
        remote_urls = (candidate for candidate in urls if check_url(candidate)[0])
        for remote_url in remote_urls:
            try:
                head = requests.head(remote_url, allow_redirects=True)
                total_size += int(head.headers.get("content-length", 0))
            except errors.RequestError:
                pass

    free_bytes = shutil.disk_usage(project_context.path).free
    if total_size <= free_bytes:
        return

    bytes_per_mb = 2**20
    raise errors.OperationError(
        "Insufficient disk space (required: {:.2f} MB" "/available: {:.2f} MB). ".format(
            total_size / bytes_per_mb, free_bytes / bytes_per_mb
        )
    )


def _create_destination_directory(dataset: Dataset, destination: Optional[Union[Path, str]] = None) -> Path:
    """Make sure the dataset's data directory exists and return the destination path inside it."""
    datadir = project_context.path / dataset.get_datadir()

    # NOTE: The data directory must exist because existence of a destination is checked later to decide what its
    # name will be
    datadir.mkdir(parents=True, exist_ok=True)

    target = destination if destination else ""
    return datadir / cast(str, get_relative_path(target, base=datadir, strict=True))


def filter_files(
    dataset: Dataset, files: List[DatasetAddMetadata], force: bool, overwrite: bool
) -> List[DatasetAddMetadata]:
    """Filter ignored and overwritten files.

    Args:
        dataset(Dataset): Dataset that files are added to.
        files(List[DatasetAddMetadata]): Candidate files.
        force(bool): If True, files ignored by git are kept.
        overwrite(bool): If True, files that already exist in the dataset are kept.

    Returns:
        List[DatasetAddMetadata]: Remaining files; files that aren't metadata-only come first, followed by
            metadata-only files (which are only checked against ``.git``).
    """

    def remove_git_files(files_to_filter: List[DatasetAddMetadata]):
        """Remove all files that are under a .git directory."""
        # NOTE: Compare the first path component; a plain string prefix check would also drop files like
        # ``.gitignore`` or ``.gitattributes`` that aren't inside a ``.git`` directory
        git_paths = [f.entity_path for f in files_to_filter if Path(f.entity_path).parts[:1] == (".git",)]
        if not git_paths:
            return files_to_filter

        communication.warn("Ignored adding paths under a .git directory:\n\t" + "\n\t".join(str(p) for p in git_paths))
        excluded = set(git_paths)
        return [f for f in files_to_filter if f.entity_path not in excluded]

    def check_ignored_files(files_to_filter: Generator[DatasetAddMetadata, None, None]):
        """Check if any files added were ignored."""
        paths = {f.get_absolute_commit_path(project_context.path): f for f in files_to_filter}

        ignored_paths = project_context.repository.get_ignored_paths(*paths)
        if ignored_paths:
            ignored_sources = [file.source for path, file in paths.items() if path in ignored_paths]

            communication.warn(
                "Theses paths are ignored by one of your .gitignore files (use '--force' flag if you really want to "
                "add them):\n\t" + "\n\t".join([str(p) for p in ignored_sources])
            )

        return (file for path, file in paths.items() if path not in ignored_paths)

    def check_existing_files(files_to_filter: Generator[DatasetAddMetadata, None, None]):
        """Check if files added already exist."""
        files_list = list(files_to_filter)
        # NOTE: Keep a list for the (ordered) warning message and a set for the membership test
        existing_paths = [f.entity_path for f in files_list if dataset.find_file(f.entity_path)]
        if existing_paths:
            communication.warn(
                "These existing files were not overwritten (use '--overwrite' flag to overwrite them):\n\t"
                + "\n\t".join([str(p) for p in existing_paths])
            )

        existing = set(existing_paths)
        return (f for f in files_list if f.entity_path not in existing)

    files = remove_git_files(files)

    # NOTE: Don't filter ignored or existing files that will be added to a remote storage
    remote_files = (f for f in files if f.metadata_only)
    local_files = (f for f in files if not f.metadata_only)

    # NOTE: Data directory of datasets with a storage backend is always ignored, so, filtering files is meaningless
    if not force and not dataset.storage:
        local_files = check_ignored_files(local_files)

    if not overwrite:
        local_files = check_existing_files(local_files)

    return list(itertools.chain(local_files, remote_files))


def get_dataset_file_path_within_dataset(dataset: Dataset, entity_path: Union[Path, str]) -> Path:
    """Return the path of a dataset file relative to the data directory of its dataset.

    NOTE: A dataset file isn't necessarily inside the dataset's datadir; in that case the entity path is returned
    unchanged (as a ``Path``).
    """
    assert not os.path.isabs(entity_path), f"Entity path cannot be absolute: {entity_path}"

    relative_path = Path(entity_path)

    try:
        relative_path = relative_path.relative_to(dataset.get_datadir())
    except ValueError:
        # NOTE: The file is outside of the datadir; keep the entity path as is
        pass

    return relative_path


def get_upload_uri(dataset: Dataset, entity_path: Union[Path, str]) -> str:
    """Build the URI in the dataset's backend storage where a dataset file would be stored.

    Args:
        dataset(Dataset): Dataset with a backend storage.
        entity_path(Union[Path, str]): Dataset file's path (entity path); it is relative to the project's root.

    Returns:
        str: URI within remote storage.
    """
    assert dataset.storage, "Cannot get URI for datasets with no backend storage"

    storage_root = dataset.storage.rstrip("/")
    relative_path = get_dataset_file_path_within_dataset(dataset=dataset, entity_path=entity_path)

    return "/".join((storage_root, str(relative_path)))


def copy_file(file: DatasetAddMetadata, dataset: Dataset, storage: Optional[IStorage]) -> List[Optional[Path]]:
    """Copy/move/link a file to dataset's data directory.

    Args:
        file(DatasetAddMetadata): File to process; its ``action``, ``size`` and ``based_on`` might be updated.
        dataset(Dataset): Dataset that the file is added to.
        storage(Optional[IStorage]): Storage used for uploading the file; ``None`` if nothing should be uploaded.

    Raises:
        errors.InvalidFileOperation: If the destination directory cannot be created or the file cannot be
            copied/moved/downloaded.
        errors.OperationError: If the file's action isn't supported.

    Returns:
        List[Optional[Path]]: ``[file.destination]`` if the file should be tracked in LFS, an empty list otherwise.
    """
    if not file.has_action:
        return []

    # NOTE: If file is in a subdirectory of a dataset's remote storage URI, only update the metadata
    if file.from_cloud_storage:
        if dataset.storage and is_uri_subfolder(resolve_uri(dataset.storage), file.url):
            file.action = DatasetAddAction.METADATA_ONLY
        else:
            file.action = DatasetAddAction.DOWNLOAD

    if file.action in (
        DatasetAddAction.COPY,
        DatasetAddAction.MOVE,
        DatasetAddAction.SYMLINK,
        DatasetAddAction.DOWNLOAD,
    ):
        try:
            file.destination.parent.mkdir(parents=True, exist_ok=True)
        except OSError as e:
            raise errors.InvalidFileOperation(f"Cannot create destination '{file.destination.parent}': {e}") from e

    file_to_upload: Union[Path, str] = file.source.resolve()
    delete_source = False
    track_in_lfs = True
    # NOTE: Path of the temporary file that a download is stored in (if any); it's deleted before returning
    temporary_download: Optional[str] = None

    try:
        try:
            if file.action == DatasetAddAction.DOWNLOAD:
                # NOTE: Download to a temporary location if dataset has a cloud storage because it's usually mounted
                # as read-only and download would fail. It's ok not to move it to dataset's data dir since it'll be
                # uploaded.
                dst: Union[Path, str]
                if storage:
                    fd, temporary_download = tempfile.mkstemp()
                    os.close(fd)
                    dst = temporary_download
                else:
                    dst = file.destination

                assert file.provider, f"Storage provider isn't set for {file} with DOWNLOAD action"
                download_storage = file.provider.get_storage()
                download_storage.download(file.url, dst)
                file_to_upload = dst
            elif file.action == DatasetAddAction.COPY:
                shutil.copy(file.source, file.destination)
            elif file.action == DatasetAddAction.MOVE:
                # NOTE: Set ``delete_source`` in case move fails due to a dataset's read-only mounted data directory
                delete_source = True
                shutil.move(file.source, file.destination, copy_function=shutil.copy)  # type: ignore
                delete_source = False
                file_to_upload = file.destination
            elif file.action == DatasetAddAction.SYMLINK:
                create_external_file(target=file.source, path=file.destination)
                # NOTE: Don't track symlinks to external files in LFS
                track_in_lfs = False
            elif file.metadata_only:
                # NOTE: Nothing to do when adding file to a dataset with a parent remote storage
                pass
            else:
                raise errors.OperationError(f"Invalid action {file.action}")
        except OSError as e:
            # NOTE: It's ok if copying data to a read-only mounted cloud storage fails
            if not ("Read-only file system" in str(e) and storage):
                failed_path = get_relative_path(file.destination, project_context.path) or file.destination
                raise errors.InvalidFileOperation(f"Cannot copy/move '{failed_path}': {e}") from e

        if file.size is None:
            file.size = get_file_size(file_to_upload)

        if storage:
            # NOTE: Don't track files in a dataset with cloud storage in LFS
            track_in_lfs = False

            if file.metadata_only:
                assert file.based_on, f"wasBasedOn isn't set for {file} with METADATA_ONLY action"
                file_uri = file.based_on.url
                md5_hash: Optional[str] = file.based_on.checksum
            else:
                file_uri = get_upload_uri(dataset=dataset, entity_path=file.entity_path)
                md5_hash = hash_file(file_to_upload, hash_type="md5")

                # NOTE: If dataset has a storage backend, upload the file to the remote storage.
                storage.upload(source=file_to_upload, uri=file_uri)

            file.based_on = RemoteEntity(url=file_uri, path=file.entity_path, checksum=md5_hash)

        if delete_source:
            file.source.unlink(missing_ok=True)
    finally:
        # NOTE: ``mkstemp`` leaves deletion to the caller. The temporary file is only needed for size, checksum and
        # upload above (assumes ``storage.upload`` has finished with the file when it returns).
        if temporary_download is not None:
            Path(temporary_download).unlink(missing_ok=True)

    return [file.destination] if track_in_lfs else []


def copy_files_to_dataset(dataset: Dataset, files: List[DatasetAddMetadata]):
    """Run ``copy_file`` for all files and track the returned paths in LFS for datasets without a storage."""
    # NOTE: A storage object is only created for datasets that have a backend storage
    storage_backend = (
        ProviderFactory.get_storage_provider(uri=dataset.storage).get_storage() if dataset.storage else None
    )

    paths_to_track = parallel_execute(copy_file, files, rate=5, dataset=dataset, storage=storage_backend)

    if not paths_to_track or dataset.storage:
        return

    track_paths_in_storage(*paths_to_track)


def add_files_to_repository(dataset: Dataset, files: List[DatasetAddMetadata]):
    """Stage the added files in the project's repository.

    Nothing is done for datasets with a backend storage. A warning is shown if there is no staged change afterwards.
    """
    # NOTE: There is nothing to track for remote storages
    if dataset.storage:
        return

    commit_paths = [entry.get_absolute_commit_path(project_context.path) for entry in files]

    repository = project_context.repository

    # Force-add to include possible ignored files
    if commit_paths:
        repository.add(*commit_paths, project_context.pointers_path, force=True)

    if len(repository.staged_changes) == 0:
        communication.warn("No new file was added to project")


def update_dataset_metadata(dataset: Dataset, files: List[DatasetAddMetadata], clear_files_before: bool):
    """Create dataset files for the newly-added files and add them to the dataset's metadata.

    If ``clear_files_before`` is set, the dataset's files are cleared before the new ones are added.
    """
    checksums: Dict[Union[Path, str], Optional[str]]

    if not dataset.storage:
        # NOTE: Git hashes are used for files that exist in the project
        existing_paths: List[Union[Path, str]] = [
            entry.entity_path for entry in files if (project_context.path / entry.entity_path).exists()
        ]
        checksums = project_context.repository.get_object_hashes(existing_paths)
    else:
        # NOTE: For datasets with cloud storage backend, we use MD5 hash as checksum instead of git hash.
        checksums = {entry.entity_path: entry.based_on.checksum for entry in files if entry.based_on}

    new_dataset_files = [
        DatasetFile.from_path(
            path=entry.entity_path,
            source=entry.url,
            based_on=entry.based_on,
            size=entry.size,
            checksum=checksums.get(entry.entity_path) or NON_EXISTING_ENTITY_CHECKSUM,
        )
        for entry in files
    ]

    if clear_files_before:
        dataset.clear_files()

    dataset.add_or_update_files(new_dataset_files)