Source code for renku.core.dataset.providers.doi

# -*- coding: utf-8 -*-
#
# Copyright 2017-2022 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""DOI API integration."""

import urllib
from pathlib import Path
from typing import Optional

from renku.core import errors
from renku.core.dataset.providers.api import ImporterApi, ProviderApi, ProviderPriority
from renku.core.util.doi import extract_doi, is_doi

DOI_BASE_URL = "https://dx.doi.org"


class DOIProvider(ProviderApi):
    """`doi.org <http://doi.org>`_ registry API provider.

    Looks up the metadata of a DOI over HTTP and wraps the JSON response in a ``DOIImporter``.
    """

    priority = ProviderPriority.HIGHER
    name = "DOI"

    def __init__(self, uri: Optional[str], headers=None, timeout=3):
        """Store the URI and the request settings.

        Args:
            uri(Optional[str]): The DOI (or DOI URI) this provider instance works on.
            headers: HTTP headers sent with the metadata request; when ``None``, an ``accept`` header asking
                for ``application/vnd.citationstyles.csl+json`` is used.
            timeout: Value stored on the instance as ``self.timeout`` (Default value = 3).
        """
        super().__init__(uri=uri)

        # NOTE(review): ``timeout`` is only stored here; ``get_importer`` does not forward it to
        # ``requests.get``. Confirm whether the request helper applies its own timeout.
        self.timeout = timeout
        self.headers = headers if headers is not None else {"accept": "application/vnd.citationstyles.csl+json"}

    @staticmethod
    def supports(uri) -> bool:
        """Whether or not this provider supports a given URI.

        Args:
            uri: The URI to check.

        Returns:
            bool: The truthiness of ``is_doi(uri)``.
        """
        return bool(is_doi(uri))

    def get_importer(self, **kwargs) -> "DOIImporter":
        """Get import manager.

        Args:
            **kwargs: Accepted but not used by this method.

        Returns:
            DOIImporter: Importer built from the metadata returned for ``self.uri``.

        Raises:
            LookupError: If the metadata request does not return HTTP status 200.
            errors.DatasetImportError: If ``DOIImporter`` cannot be constructed from the response fields.
        """
        from renku.core.util import requests

        def query(doi):
            """Retrieve metadata for given doi."""
            doi = extract_doi(doi)
            url = make_doi_url(doi)

            response = requests.get(url, headers=self.headers)

            if response.status_code != 200:
                raise LookupError("record not found. Status: {}".format(response.status_code))

            return response

        def serialize(response):
            """Serialize HTTP response for DOI."""
            json_data = response.json()
            # Turn the JSON keys into keyword-argument names: ``container-title`` -> ``container_title``,
            # ``DOI`` -> ``doi``.
            data = {key.replace("-", "_").lower(): value for key, value in json_data.items()}
            try:
                return DOIImporter(**data)
            except TypeError as error:
                # A ``TypeError`` here means the fields could not be passed to ``DOIImporter`` (e.g. unexpected
                # or missing keyword arguments); keep the original error as the explicit cause.
                raise errors.DatasetImportError("doi metadata could not be serialized") from error

        query_response = query(self.uri)
        return serialize(query_response)


class DOIImporter(ImporterApi):
    """Response from `doi.org <http://doi.org>`_ for DOI metadata.

    The constructor arguments are the fields of the metadata response; ``id``, ``doi`` and ``url`` are
    required, every other field is optional and stored unchanged on the instance.
    """

    def __init__(
        self,
        id,
        doi,
        url,
        abstract=None,
        author=None,
        categories=None,
        container_title=None,
        contributor=None,
        copyright=None,
        issued=None,
        language=None,
        publisher=None,
        title=None,
        type=None,
        version=None,
    ):
        """Store the metadata fields of a DOI record.

        Args:
            id: Record identifier, stored as ``self.id``.
            doi: The DOI, stored as ``self.doi``.
            url: Record URL; passed to the base class as both ``uri`` and ``original_uri``.
            abstract: Optional metadata field (Default value = None).
            author: Optional metadata field (Default value = None).
            categories: Optional metadata field (Default value = None).
            container_title: Optional metadata field (Default value = None).
            contributor: Optional metadata field (Default value = None).
            copyright: Optional metadata field (Default value = None).
            issued: Optional metadata field (Default value = None).
            language: Optional metadata field (Default value = None).
            publisher: Optional metadata field (Default value = None).
            title: Optional metadata field (Default value = None).
            type: Optional metadata field (Default value = None).
            version: Optional record version, exposed through the ``version`` property (Default value = None).
        """
        super().__init__(uri=url, original_uri=url)

        self.id = id
        self.doi = doi

        self.abstract = abstract
        self.author = author
        self.categories = categories
        self.container_title = container_title
        self.contributor = contributor
        self.copyright = copyright
        self.issued = issued
        self.language = language
        self.publisher = publisher
        self.title = title
        self.type = type
        self._version = version

    @property
    def version(self) -> Optional[str]:
        """Get record version (``None`` when the record was created without one)."""
        return self._version

    @property
    def latest_uri(self) -> str:
        """Get URI of the latest version; this is always the record's own ``uri``."""
        return self.uri

    def fetch_provider_dataset(self):
        """Deserialize this record to a ``ProviderDataset``.

        Raises:
            NotImplementedError: Always; not implemented by this importer.
        """
        raise NotImplementedError

    def is_latest_version(self) -> bool:
        """Check if record is at last possible version.

        Returns:
            bool: Always ``True``.
        """
        return True

    def download_files(self, destination: Path, extract: bool):
        """Download dataset files from the remote provider.

        Raises:
            NotImplementedError: Always; not implemented by this importer.
        """
        raise NotImplementedError

    def tag_dataset(self, name: str) -> None:
        """Create a tag for the dataset ``name`` if the remote dataset has a tag/version.

        Raises:
            NotImplementedError: Always; not implemented by this importer.
        """
        raise NotImplementedError

    def copy_extra_metadata(self, new_dataset) -> None:
        """Copy provider specific metadata once the dataset is created.

        Raises:
            NotImplementedError: Always; not implemented by this importer.
        """
        raise NotImplementedError


def make_doi_url(doi):
    """Create URL to access DOI metadata.

    Args:
        doi: A DOI, optionally written with a ``doi:`` scheme prefix.

    Returns:
        str: The result of joining ``doi`` onto ``DOI_BASE_URL`` with ``urljoin``.
    """
    # Import the submodule explicitly: a bare ``import urllib`` does not guarantee that ``urllib.parse``
    # has been loaded.
    from urllib.parse import urljoin, urlparse

    parsed_url = urlparse(doi)
    if parsed_url.scheme == "doi":
        # Drop the ``doi:`` scheme so the remainder is joined as a relative reference.
        doi = parsed_url._replace(scheme="").geturl()
    return urljoin(DOI_BASE_URL, doi)