Source code for renku.core.dataset.dataset_add

# -*- coding: utf-8 -*-
#
# Copyright 2017-2022 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Dataset add business logic."""

import itertools
import os.path
import shutil
from pathlib import Path
from typing import Generator, List, Optional, Union, cast

from renku.core import errors
from renku.core.dataset.context import DatasetContext
from renku.core.dataset.datasets_provenance import DatasetsProvenance
from renku.core.dataset.pointer_file import create_external_file
from renku.core.dataset.providers.api import ImporterApi
from renku.core.dataset.providers.factory import ProviderFactory
from renku.core.dataset.providers.models import DatasetAddAction, DatasetAddMetadata
from renku.core.interface.storage import IStorage
from renku.core.storage import check_external_storage, track_paths_in_storage
from renku.core.util import communication, requests
from renku.core.util.dataset import check_url
from renku.core.util.git import get_git_user
from renku.core.util.os import delete_dataset_file, get_files, get_relative_path, hash_file
from renku.core.util.urls import is_uri_subfolder
from renku.domain_model.dataset import Dataset, DatasetFile, RemoteEntity
from renku.domain_model.project_context import project_context


def add_to_dataset(
    dataset_name: str,
    urls: List[str],
    *,
    importer: Optional[ImporterApi] = None,
    force: bool = False,
    create: bool = False,
    overwrite: bool = False,
    sources: Optional[List[Union[str, Path]]] = None,
    destination: str = "",
    revision: Optional[str] = None,
    extract: bool = False,
    clear_files_before: bool = False,
    total_size: Optional[int] = None,
    datadir: Optional[Path] = None,
    storage: Optional[str] = None,
    **kwargs,
) -> Dataset:
    """Import the data into the data directory."""
    sources = sources or []

    if not create and storage:
        raise errors.ParameterError("Storage can be set only when creating a dataset")

    try:
        with DatasetContext(name=dataset_name, create=create, datadir=datadir, storage=storage) as dataset:
            destination_path = _create_destination_directory(dataset, destination)

            check_external_storage()

            # NOTE: This is not required for external storages
            if not dataset.storage:
                _check_available_space(urls, total_size=total_size)

            datadir = cast(Path, project_context.path / dataset.get_datadir())
            if create and datadir.exists() and not dataset.storage:
                # NOTE: Add datadir to paths to add missing files on create
                for file in get_files(datadir):
                    urls.append(str(file))

            files = _get_files_metadata(
                urls=urls,
                dataset=dataset,
                importer=importer,
                destination=destination_path,
                revision=revision,
                sources=sources,
                extract=extract,
                force=force,
                **kwargs,
            )

            files = filter_files(dataset=dataset, files=files, force=force, overwrite=overwrite)
            if not files:
                if create:
                    raise errors.UsageError("There are no files to create a dataset")
                else:
                    communication.warn("No new file was added to project")
                    return dataset

            # NOTE: All files at this point can be force-added

            move_files_to_dataset(dataset=dataset, files=files)
            add_files_to_repository(dataset=dataset, files=files)
            update_dataset_metadata(dataset=dataset, files=files, clear_files_before=clear_files_before)

            DatasetsProvenance().add_or_update(dataset, creator=get_git_user(repository=project_context.repository))
    except errors.DatasetNotFound:
        raise errors.DatasetNotFound(
            message="Dataset '{0}' does not exist.\n"
            "Use 'renku dataset create {0}' to create the dataset or retry 'renku dataset add {0}' command "
            "with '--create' option for automatic dataset creation.".format(dataset_name)
        )
    except (FileNotFoundError, errors.GitCommandError) as e:
        raise errors.ParameterError("Could not find paths/URLs: \n{0}".format("\n".join(urls))) from e
    else:
        project_context.database.commit()

    return dataset

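# Illustrative usage sketch (not part of this module): add a local file and a remote
# URL to a dataset named "my-data", creating the dataset if it does not exist yet.
# The dataset name and URLs below are hypothetical.
#
#     dataset = add_to_dataset(
#         dataset_name="my-data",
#         urls=["data/measurements.csv", "https://example.com/archive.zip"],
#         create=True,
#     )
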
def _get_files_metadata(
    *,
    urls: List[str],
    importer: Optional[ImporterApi] = None,
    dataset: Dataset,
    destination: Path,
    extract: bool,
    revision: Optional[str],
    sources: List[Union[str, Path]],
    force: bool = False,
    **kwargs,
) -> List[DatasetAddMetadata]:
    """Process file URLs for adding to a dataset."""
    if importer:
        return importer.download_files(destination=destination, extract=extract)

    if len(urls) == 0:
        raise errors.ParameterError("No URL is specified")
    if sources and len(urls) > 1:
        raise errors.ParameterError("Cannot use '--source' with multiple URLs.")

    files = []

    for url in urls:
        _, is_git = check_url(url)

        if not is_git and sources:
            raise errors.ParameterError("Cannot use '-s/--src/--source' with URLs or local files.")

        provider = ProviderFactory.get_add_provider(uri=url)

        new_files = provider.add(
            uri=url,
            destination=destination,
            revision=revision,
            sources=sources,
            dataset=dataset,
            extract=extract,
            force=force,
            **kwargs,
        )

        files.extend(new_files)

    return files


def _check_available_space(urls: List[str], total_size: Optional[int] = None):
    """Check that there is enough space available on the device for download."""
    if total_size is None:
        total_size = 0
        for url in urls:
            try:
                response = requests.head(url, allow_redirects=True)
                total_size += int(response.headers.get("content-length", 0))
            except errors.RequestError:
                pass

    usage = shutil.disk_usage(project_context.path)

    if total_size > usage.free:
        mb = 2**20
        message = "Insufficient disk space (required: {:.2f} MB/available: {:.2f} MB).".format(
            total_size / mb, usage.free / mb
        )
        raise errors.OperationError(message)


def _create_destination_directory(dataset: Dataset, destination: Optional[Union[Path, str]] = None) -> Path:
    """Create directory for dataset add."""
    dataset_datadir = project_context.path / dataset.get_datadir()

    if dataset_datadir.is_symlink():
        dataset_datadir.unlink()

    # NOTE: Make sure that dataset's data dir exists because we check for existence of a destination later to decide
    # what will be its name
    dataset_datadir.mkdir(parents=True, exist_ok=True)

    destination = destination or ""
    relative_path = cast(str, get_relative_path(destination, base=dataset_datadir, strict=True))
    return dataset_datadir / relative_path

def filter_files(
    dataset: Dataset, files: List[DatasetAddMetadata], force: bool, overwrite: bool
) -> List[DatasetAddMetadata]:
    """Filter ignored and overwritten files."""

    def remove_git_files(files_to_filter: List[DatasetAddMetadata]):
        """Remove all files that are under a .git directory."""
        git_paths = [f.entity_path for f in files_to_filter if str(f.entity_path).startswith(".git")]
        if not git_paths:
            return files_to_filter

        communication.warn(
            "Ignored adding paths under a .git directory:\n\t" + "\n\t".join(str(p) for p in git_paths)
        )
        return [f for f in files_to_filter if f.entity_path not in git_paths]

    def check_ignored_files(files_to_filter: Generator[DatasetAddMetadata, None, None]):
        """Check if any files added were ignored."""
        paths = {f.get_absolute_commit_path(project_context.path): f for f in files_to_filter}
        ignored_paths = project_context.repository.get_ignored_paths(*paths)

        if ignored_paths:
            ignored_sources = [file.source for path, file in paths.items() if path in ignored_paths]

            communication.warn(
                "These paths are ignored by one of your .gitignore files (use '--force' flag if you really want to "
                "add them):\n\t" + "\n\t".join([str(p) for p in ignored_sources])
            )

        return (file for path, file in paths.items() if path not in ignored_paths)

    def check_existing_files(files_to_filter: Generator[DatasetAddMetadata, None, None]):
        """Check if files added already exist."""
        files_list = list(files_to_filter)
        existing_paths = [f.entity_path for f in files_list if dataset.find_file(f.entity_path)]

        if existing_paths:
            communication.warn(
                "These existing files were not overwritten (use '--overwrite' flag to overwrite them):\n\t"
                + "\n\t".join([str(p) for p in existing_paths])
            )

        return (f for f in files_list if f.entity_path not in existing_paths)

    files = remove_git_files(files)

    # NOTE: Don't filter ignored or existing files that will be added to a remote storage
    remote_files = (f for f in files if f.metadata_only)
    local_files = (f for f in files if not f.metadata_only)

    # NOTE: Data directory of datasets with a storage backend is always ignored, so filtering files is meaningless
    if not force and not dataset.storage:
        local_files = check_ignored_files(local_files)

    if not overwrite:
        local_files = check_existing_files(local_files)

    files = list(itertools.chain(local_files, remote_files))

    return files

def get_dataset_file_path_within_dataset(dataset: Dataset, entity_path: Union[Path, str]) -> Path:
    """Return a dataset file's path relative to the dataset's datadir.

    NOTE: Dataset files can reside outside the dataset's datadir.
    """
    assert not os.path.isabs(entity_path), f"Entity path cannot be absolute: {entity_path}"

    entity_path = Path(entity_path)

    try:
        return entity_path.relative_to(dataset.get_datadir())
    except ValueError:
        return entity_path

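# Illustrative example (hypothetical paths): for a dataset whose datadir is
# "data/my-data", an entity path "data/my-data/raw/file.csv" maps to "raw/file.csv",
# while a file outside the datadir, e.g. "notes/readme.txt", is returned unchanged.
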
def get_upload_uri(dataset: Dataset, entity_path: Union[Path, str]) -> str:
    """Return the remote storage path at which a dataset file would be located.

    Args:
        dataset(Dataset): Dataset with a backend storage.
        entity_path(Union[Path, str]): Dataset file's path (entity path); it is relative to the project's root.

    Returns:
        str: URI within remote storage.
    """
    assert dataset.storage, "Cannot get URI for datasets with no backend storage"

    base = dataset.storage.rstrip("/")
    path_within_dataset = get_dataset_file_path_within_dataset(dataset=dataset, entity_path=entity_path)

    return f"{base}/{path_within_dataset}"

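# Illustrative example (hypothetical URIs and paths): for a dataset with storage
# "s3://bucket/backend" and datadir "data/my-data", the entity path
# "data/my-data/raw/file.csv" yields "s3://bucket/backend/raw/file.csv".
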
def move_files_to_dataset(dataset: Dataset, files: List[DatasetAddMetadata]):
    """Copy/Move files into a dataset's directory."""

    def move_file(file: DatasetAddMetadata, storage: Optional[IStorage]) -> bool:
        if not file.has_action:
            return False

        if file.action in (
            DatasetAddAction.COPY,
            DatasetAddAction.MOVE,
            DatasetAddAction.SYMLINK,
            DatasetAddAction.DOWNLOAD,
        ):
            # NOTE: Remove existing file if any; required as a safety-net to avoid corrupting external files
            delete_dataset_file(file.destination, follow_symlinks=True)
            file.destination.parent.mkdir(parents=True, exist_ok=True)

        track_in_lfs = True

        # NOTE: If file is in a sub-directory of a dataset's remote storage URI, only update the metadata
        if file.remote_storage:
            if dataset.storage and is_uri_subfolder(dataset.storage, file.url):
                file.action = DatasetAddAction.METADATA_ONLY
            else:
                file.action = DatasetAddAction.DOWNLOAD

        file_to_upload = file.source.resolve()

        try:
            if file.action == DatasetAddAction.COPY:
                shutil.copy(file.source, file.destination)
            elif file.action == DatasetAddAction.MOVE:
                shutil.move(file.source, file.destination, copy_function=shutil.copy)  # type: ignore
            elif file.action == DatasetAddAction.SYMLINK:
                create_external_file(target=file.source, path=file.destination)
                # NOTE: Don't track symlinks to external files in LFS
                track_in_lfs = False
            elif file.action == DatasetAddAction.DOWNLOAD:
                assert file.provider, f"Storage provider isn't set for {file} with DOWNLOAD action"
                download_storage = file.provider.get_storage()
                download_storage.download(file.url, file.destination)
                file_to_upload = file.destination
            elif file.metadata_only:
                # NOTE: Nothing to do when adding file to a dataset with a parent remote storage
                pass
            else:
                raise errors.OperationError(f"Invalid action {file.action}")
        except OSError as e:
            # NOTE: It's ok if copying data to a read-only mounted cloud storage fails
            if "Read-only file system" in str(e) and storage:
                pass
            else:
                raise

        # NOTE: We always copy the files to the dataset's data dir. If dataset has a storage backend, we also upload
        # the file to the remote storage.
        if storage:
            if file.metadata_only:
                assert file.based_on, f"wasBasedOn isn't set for {file} with METADATA_ONLY action"
                file_uri = file.based_on.url
                md5_hash = file.based_on.checksum
            else:
                file_uri = get_upload_uri(dataset=dataset, entity_path=file.entity_path)
                storage.upload(source=file_to_upload, uri=file_uri)
                md5_hash = hash_file(file_to_upload, hash_type="md5") or ""

            file.based_on = RemoteEntity(url=file_uri, path=file.entity_path, checksum=md5_hash)

        return track_in_lfs

    dataset_storage = None
    if dataset.storage:
        provider = ProviderFactory.get_storage_provider(uri=dataset.storage)
        dataset_storage = provider.get_storage()

    lfs_files = []
    for dataset_file in files:
        # TODO: Parallelize copy/download/upload
        if move_file(file=dataset_file, storage=dataset_storage):
            lfs_files.append(dataset_file.destination)

    if lfs_files and not dataset.storage:
        track_paths_in_storage(*lfs_files)

def add_files_to_repository(dataset: Dataset, files: List[DatasetAddMetadata]):
    """Track files in project's repository."""
    # NOTE: There is nothing to track for remote storages
    if dataset.storage:
        communication.info("Nothing to add to the project for datasets with a storage backend")
        return

    # NOTE: Don't commit files that will be uploaded to a remote storage
    paths_to_commit = [f.get_absolute_commit_path(project_context.path) for f in files]

    repository = project_context.repository

    # Force-add to include possible ignored files
    if len(paths_to_commit) > 0:
        repository.add(*paths_to_commit, project_context.pointers_path, force=True)

    n_staged_changes = len(repository.staged_changes)
    if n_staged_changes == 0:
        communication.warn("No new file was added to project")

def update_dataset_metadata(dataset: Dataset, files: List[DatasetAddMetadata], clear_files_before: bool):
    """Add newly-added files to the dataset's metadata."""
    dataset_files = []

    repo_paths: List[Union[Path, str]] = [
        file.entity_path for file in files if (project_context.path / file.entity_path).exists()
    ]
    checksums = project_context.repository.get_object_hashes(repo_paths)

    for file in files:
        dataset_file = DatasetFile.from_path(
            path=file.entity_path, source=file.url, based_on=file.based_on, checksum=checksums.get(file.entity_path)
        )
        dataset_files.append(dataset_file)

    if clear_files_before:
        dataset.clear_files()
    dataset.add_or_update_files(dataset_files)