Source code for renku.core.models.entities

# -*- coding: utf-8 -*-
#
# Copyright 2018-2020- Swiss Data Science Center (SDSC)
# A partnership between École Polytechnique Fédérale de Lausanne (EPFL) and
# Eidgenössische Technische Hochschule Zürich (ETHZ).
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Represent provenance entities."""

import os
import pathlib
import weakref
from urllib.parse import quote, urljoin

import attr

from renku.core.models.calamus import JsonLDSchema, Nested, fields, prov, rdfs, renku, schema, wfprov
from renku.core.models.projects import Project, ProjectSchema


def _str_or_none(data):
    """Return str representation or None."""
    return str(data) if data is not None else data


@attr.s(cmp=False)
class CommitMixin:
    """Mixin tying an object to a commit, a client and a path.

    The identifier (``_id``) and label (``_label``) are derived from the
    commit's ``hexsha`` and the path when they are not given explicitly.
    """

    # Commit object; only its ``hexsha`` attribute is read here.
    commit = attr.ib(default=None, kw_only=True)
    # Client object; ``path``, ``project`` and ``submodules`` are read from it.
    client = attr.ib(default=None, kw_only=True)
    # Path of the item, stored as ``str`` (or ``None``).
    path = attr.ib(default=None, kw_only=True, converter=_str_or_none)

    _id = attr.ib(default=None, kw_only=True)
    _label = attr.ib(kw_only=True)
    _project = attr.ib(type=Project, kw_only=True, default=None)

    @property
    def submodules(self):
        """Return the client's submodules, or ``None`` without a client."""
        return self.client.submodules if self.client else None

    def default_id(self):
        """Compute the identifier from the client, commit hash and path."""
        if self.commit:
            revision = self.commit.hexsha
        else:
            revision = "UNCOMMITTED"
        return generate_file_id(client=self.client, hexsha=revision, path=self.path)

    @_label.default
    def default_label(self):
        """Build the default label: ``<path>@<hexsha>`` or just ``<hexsha>``."""
        revision = self.commit.hexsha if self.commit else "UNCOMMITTED"
        if not self.path:
            return revision

        location = self.path
        if self.client and os.path.isabs(location):
            # Labels use the path relative to the client's root.
            location = pathlib.Path(location).relative_to(self.client.path)
        return generate_label(location, revision)

    def __attrs_post_init__(self):
        """Normalize the path, bind the project and fill in a missing id."""
        if self.path and self.client:
            candidate = pathlib.Path(self.path)
            if candidate.is_absolute():
                self.path = str(candidate.relative_to(self.client.path))

        # The project always mirrors the client's current project.
        if self.client:
            self._project = self.client.project

        if not self._id:
            self._id = self.default_id()


@attr.s(cmp=False,)
class Entity(CommitMixin):
    """Represent a data value or item."""

    # Weak reference to the containing object (or ``None``); stored weakly so
    # a child does not hold a strong reference to its parent.
    _parent = attr.ib(
        default=None, kw_only=True, converter=lambda value: weakref.ref(value) if value is not None else None,
    )

    checksum = attr.ib(default=None, kw_only=True, type=str)

    @classmethod
    def from_revision(cls, client, path, revision="HEAD", parent=None, find_previous=True, **kwargs):
        """Return dependency from given path and revision.

        :param client: client used to resolve the path and revision.
        :param path: path of the item.
        :param revision: revision to start from.
        :param parent: object to record as the parent of the new entity.
        :param find_previous: when true, ``revision`` is first replaced by
            the result of ``client.find_previous_commit``.
        :param kwargs: extra keyword arguments forwarded to the constructor
            of non-directory entities (and to recursive calls).
        :returns: a ``Collection`` when the resolved path is a directory
            other than ``"."``, otherwise an instance of ``cls``.
        """
        if find_previous:
            revision = client.find_previous_commit(path, revision=revision)

        client, commit, path = client.resolve_in_submodules(revision, path,)

        path_ = client.path / path
        if path != "." and path_.is_dir():
            entity = Collection(client=client, commit=commit, path=path, members=[], parent=parent,)

            files_in_commit = commit.stats.files

            # update members with commits
            for member in path_.iterdir():
                if member.name == ".gitkeep":
                    continue

                member_path = str(member.relative_to(client.path))

                # If the member was touched by this commit we already know
                # its newest commit, so there is no need to look it up.
                lookup_previous = member_path not in files_in_commit

                try:
                    assert all(member_path != m.path for m in entity.members)

                    entity.members.append(
                        cls.from_revision(
                            client, member_path, commit, parent=entity, find_previous=lookup_previous, **kwargs
                        )
                    )
                except KeyError:
                    # NOTE(review): deliberate best-effort skip; presumably
                    # raised for members unknown to the repository - confirm.
                    pass

        else:
            entity = cls(client=client, commit=commit, path=str(path), parent=parent, **kwargs)

        return entity

    @property
    def parent(self):  # pragma: no cover
        """Return the parent object."""
        return self._parent() if self._parent is not None else None

    @property
    def entities(self):
        """Yield itself.

        When a client is set but no commit is, the commit is first resolved
        from the revision stored in the label.
        """
        if self.client and not self.commit and self._label and "@UNCOMMITTED" not in self._label:
            # The label is "<path>@<revision>" or just "<revision>". Split on
            # the LAST "@" only so that paths containing "@" still yield the
            # revision, and a label without "@" is used as the revision.
            revision = self._label.rsplit("@", maxsplit=1)[-1]
            self.commit = self.client.repo.commit(revision)

        yield self

    def set_client(self, client):
        """Sets the clients on this entity."""
        self.client = client
@attr.s(cmp=False,)
class Collection(Entity):
    """Represent a directory with files."""

    # Child entities; ``None`` means "compute from the directory content"
    # during ``__attrs_post_init__``.
    members = attr.ib(kw_only=True, default=None)

    def default_members(self):
        """Generate default members as entities from current path.

        :returns: a list with one ``Collection`` per sub-directory and one
            ``Entity`` per other entry of the directory; an empty list when
            there is no client or the directory does not exist.
        """
        if not self.client:
            return []
        dir_path = self.client.path / self.path

        if not dir_path.exists():
            # likely a directory deleted in a previous commit
            return []

        assert dir_path.is_dir()

        members = []
        for path in dir_path.iterdir():
            if path.name == ".gitkeep":
                continue  # ignore empty directories in Git repository
            cls = Collection if path.is_dir() else Entity
            members.append(
                cls(commit=self.commit, client=self.client, path=str(path.relative_to(self.client.path)), parent=self,)
            )
        return members

    @property
    def entities(self):
        """Recursively return all files.

        Members are yielded first (each inheriting this collection's client
        when it has none), then the collection itself.
        """
        for member in self.members:
            if not member.client and self.client:
                member.client = self.client
            yield from member.entities

        if self.client and not self.commit and self._label and "@UNCOMMITTED" not in self._label:
            # The label is "<path>@<revision>" or just "<revision>". Split on
            # the LAST "@" only so that paths containing "@" still yield the
            # revision, and a label without "@" is used as the revision.
            revision = self._label.rsplit("@", maxsplit=1)[-1]
            self.commit = self.client.repo.commit(revision)

        yield self

    def set_client(self, client):
        """Sets the clients on this entity and on all of its members."""
        super().set_client(client)

        for m in self.members:
            m.set_client(client)

    def __attrs_post_init__(self):
        """Init members and point each member's parent at this collection."""
        super().__attrs_post_init__()

        if self.members is None:
            self.members = self.default_members()

        for member in self.members:
            member._parent = weakref.ref(self)
class CommitMixinSchema(JsonLDSchema):
    """CommitMixin schema."""

    class Meta:
        """Meta class."""

        model = CommitMixin

    path = fields.String(prov.atLocation)
    _id = fields.Id(init_name="id")
    _label = fields.String(rdfs.label, init_name="label", missing=None)
    _project = Nested(schema.isPartOf, ProjectSchema, init_name="project", missing=None)


class EntitySchema(CommitMixinSchema):
    """Entity schema."""

    class Meta:
        """Meta class."""

        rdf_type = [prov.Entity, wfprov.Artifact]
        model = Entity

    checksum = fields.String(renku.checksum, missing=None)


class CollectionSchema(EntitySchema):
    """Collection schema."""

    class Meta:
        """Meta class."""

        rdf_type = [prov.Collection]
        model = Collection

    # Members may be plain entities or nested collections; the schema refers
    # to itself by name because the class is not yet defined at this point.
    members = Nested(prov.hadMember, [EntitySchema, "CollectionSchema"], many=True)


def generate_label(path, hexsha):
    """Generate label field as ``<path>@<hexsha>``."""
    return f"{path}@{hexsha}"


def generate_file_id(client, hexsha, path):
    """Generate DatasetFile id field.

    :param client: optional client; its ``remote`` mapping may supply the
        host.
    :param hexsha: revision identifier placed in the URL.
    :param path: path of the file; it is URL-quoted.
    :returns: ``https://<host>/blob/<hexsha>/<quoted path>``.
    """
    # Determine the hostname for the resource URIs.
    # If RENKU_DOMAIN is set, it overrides the host from remote.
    # Default is localhost.
    host = "localhost"
    if client:
        host = client.remote.get("host") or host
    host = os.environ.get("RENKU_DOMAIN") or host

    # always set the id by the identifier
    # The URL path is built directly: the former single-argument
    # ``pathlib.posixpath.join`` call returned its input unchanged and relied
    # on an attribute that is not part of pathlib's public API.
    return urljoin(f"https://{host}", f"/blob/{hexsha}/{quote(str(path))}")