Source code for renku.core.dataset.providers.doi

# -*- coding: utf-8 -*-
#
# Copyright 2017-2022 - Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""DOI API integration."""

import urllib
from pathlib import Path
from typing import Optional

from renku.core import errors
from renku.core.dataset.providers.api import ImporterApi, ProviderApi, ProviderPriority
from renku.core.util.doi import extract_doi, is_doi

# Base URL that ``make_doi_url`` joins a DOI onto to build the metadata lookup URL.
DOI_BASE_URL = "https://dx.doi.org"


class DOIProvider(ProviderApi):
    """`doi.org <http://doi.org>`_ registry API provider.

    Looks up the metadata of a DOI through the URL built by ``make_doi_url``
    and wraps the returned fields in a ``DOIImporter``.
    """

    priority = ProviderPriority.HIGHER
    name = "DOI"

    def __init__(self, uri: Optional[str], headers=None, timeout=3):
        """Store the URI and the request options.

        Args:
            uri: URI holding the DOI to resolve; it is passed through
                ``extract_doi`` before the lookup.
            headers: HTTP headers sent with the metadata request. When ``None``,
                an ``accept`` header asking for CSL JSON is used.
            timeout: Value kept on the instance as ``self.timeout``.
        """
        super().__init__(uri=uri)
        # NOTE(review): ``timeout`` is stored here but is not forwarded to
        # ``requests.get`` in ``get_importer`` -- confirm whether that is intended.
        self.timeout = timeout
        self.headers = headers if headers is not None else {"accept": "application/vnd.citationstyles.csl+json"}

    @staticmethod
    def supports(uri) -> bool:
        """Whether or not this provider supports a given URI."""
        return bool(is_doi(uri))

    def get_importer(self, **kwargs) -> "DOIImporter":
        """Get import manager.

        Args:
            **kwargs: Accepted but not used by this implementation.

        Returns:
            A ``DOIImporter`` built from the metadata returned for ``self.uri``.

        Raises:
            LookupError: If the metadata request does not answer with HTTP 200.
            errors.DatasetImportError: If the returned fields cannot be passed
                to ``DOIImporter`` as keyword arguments.
        """
        # Kept as a function-level import, exactly as in the original code.
        from renku.core.util import requests

        def query(doi):
            """Retrieve metadata for given doi."""
            doi = extract_doi(doi)
            url = make_doi_url(doi)

            response = requests.get(url, headers=self.headers)
            if response.status_code != 200:
                raise LookupError(f"record not found. Status: {response.status_code}")

            return response

        def serialize(response):
            """Serialize HTTP response for DOI."""
            json_data = response.json()
            # Turn keys such as ``container-title`` into the lower-case,
            # underscore-separated keyword names that ``DOIImporter`` accepts.
            data = {key.replace("-", "_").lower(): value for key, value in json_data.items()}

            try:
                return DOIImporter(**data)
            except TypeError as error:
                # Chain the TypeError so the offending keyword stays visible in the traceback.
                raise errors.DatasetImportError("doi metadata could not be serialized") from error

        query_response = query(self.uri)
        return serialize(query_response)
class DOIImporter(ImporterApi):
    """Response from `doi.org <http://doi.org>`_ for DOI metadata.

    Holds the metadata fields of one DOI record as plain attributes. The
    dataset-related operations of the importer interface are not implemented
    here and raise ``NotImplementedError``.
    """

    def __init__(
        self,
        id,
        doi,
        url,
        abstract=None,
        author=None,
        categories=None,
        container_title=None,
        contributor=None,
        copyright=None,
        issued=None,
        language=None,
        publisher=None,
        title=None,
        type=None,
        version=None,
    ):
        """Keep every metadata field on the instance.

        ``url`` is handed to the base class as both ``uri`` and
        ``original_uri``; ``version`` is stored privately and exposed through
        the ``version`` property.
        """
        super().__init__(uri=url, original_uri=url)

        # Identifiers of the record.
        self.doi = doi
        self.id = id

        # Descriptive fields.
        self.title = title
        self.abstract = abstract
        self.type = type
        self.language = language
        self.categories = categories
        self.container_title = container_title

        # People and organisations.
        self.author = author
        self.contributor = contributor
        self.publisher = publisher

        # Rights and publication details.
        self.copyright = copyright
        self.issued = issued
        self._version = version

    @property
    def version(self) -> str:
        """Version value given to the constructor."""
        return self._version

    @property
    def latest_uri(self) -> str:
        """URI of the latest version; this is simply the record's own ``uri``."""
        return self.uri

    def fetch_provider_dataset(self):
        """Deserialize this record to a ``ProviderDataset`` (not implemented)."""
        raise NotImplementedError

    def is_latest_version(self) -> bool:
        """Report whether the record is at its last version; always ``True`` here."""
        return True

    def download_files(self, destination: Path, extract: bool):
        """Download dataset files from the remote provider (not implemented)."""
        raise NotImplementedError

    def tag_dataset(self, name: str) -> None:
        """Create a tag for the dataset ``name`` (not implemented)."""
        raise NotImplementedError

    def copy_extra_metadata(self, new_dataset) -> None:
        """Copy provider specific metadata to ``new_dataset`` (not implemented)."""
        raise NotImplementedError
def make_doi_url(doi):
    """Create URL to access DOI metadata.

    Args:
        doi: A DOI, optionally written with a leading ``doi:`` scheme.

    Returns:
        The URL obtained by joining the DOI (with any ``doi:`` scheme removed)
        onto ``DOI_BASE_URL``.
    """
    # ``import urllib`` on its own does not guarantee that the ``urllib.parse``
    # submodule is loaded, so import it explicitly where it is used.
    import urllib.parse

    parsed_url = urllib.parse.urlparse(doi)
    if parsed_url.scheme == "doi":
        # Drop the ``doi:`` prefix so only the identifier is joined to the base URL.
        doi = parsed_url._replace(scheme="").geturl()

    return urllib.parse.urljoin(DOI_BASE_URL, doi)