"""Dataset metadata."""
import json
import collections
import pathlib
import typing

import attr
from clldutils import licenses
from clldutils.misc import nfilter
from clldutils.markup import iter_markdown_tables

__all__ = ['Metadata', 'get_creators_and_contributors']



@attr.s
class Metadata(object):
    """
    Dataset metadata is used as follows:

    - it is (partly) elicited when creating a new dataset directory ...
    - ... and subsequently written to the directory ...
    - ... where it may be edited ("by hand") ...
    - ... and from where it is read when initializing a `Dataset` object.

    To add custom metadata fields for a dataset,

    - inherit from `Metadata`,
    - add more `attr.ib` s,
    - register the subclass with the dataset by assigning it to
      `cldfbench.Dataset.metadata_cls`.
    """
    # Fields flagged `elicit=True` are prompted for interactively by `elicit()`.
    id = attr.ib(
        default=None,
        metadata=dict(elicit=True, required=True))
    title = attr.ib(
        default=None,
        metadata=dict(elicit=True, required=True))
    description = attr.ib(
        default=None)
    license = attr.ib(
        default=None,
        metadata=dict(elicit=True, required=True))
    url = attr.ib(
        default=None,
        metadata=dict(elicit=True))
    citation = attr.ib(
        default=None,
        metadata=dict(elicit=True, required=True))

    @classmethod
    def elicit(cls) -> 'Metadata':
        """
        Factory method, called when creating a new dataset directory.

        Prompts on stdin for every field whose metadata has `elicit` set. An empty
        answer falls back to the field's default, if it declares one.
        """
        kw = {}
        for field in attr.fields(cls):
            if field.metadata.get('elicit', False):
                res = input('{0}: '.format(field.name))
                if (not res) and field.default is not attr.NOTHING:
                    res = field.default
                kw[field.name] = res
        return cls(**kw)

    @classmethod
    def from_file(cls, fname: pathlib.Path) -> 'Metadata':
        """
        Factory method, called when instantiating a `Dataset` object.

        :param fname: Path of a UTF-8 encoded JSON file; its top-level object is \
        passed as keyword arguments to the class.
        :raises ValueError: if the file content is not valid JSON.
        """
        with fname.open('r', encoding='utf-8') as fp:
            try:
                return cls(**json.load(fp))
            except json.decoder.JSONDecodeError as e:  # pragma: no cover
                # Re-raise with the resolved path so the offending file is easy to find.
                raise ValueError(
                    'Invalid JSON file: {}\n{}'.format(fname.resolve(), e)) from e

    def write(self, fname: pathlib.Path) -> None:
        """
        Serialize all fields as indented JSON to `fname` (UTF-8 encoded).
        """
        with fname.open('w', encoding='utf-8') as fp:
            json.dump(attr.asdict(self), fp, indent=4)

    @property
    def known_license(self) -> typing.Union[None, licenses.License]:
        """
        The result of `licenses.find` for `self.license`, or `None` if no license is set.
        """
        if self.license:
            return licenses.find(self.license)
        return None

    @property
    def zenodo_license(self) -> typing.Union[None, str]:
        """
        The identifier of the known license if it is listed in `LICENSES`, else `None`.
        """
        # NOTE(review): assumes the license object's `id` attribute is what gets matched
        # against `LICENSES` - confirm against the upstream module.
        if self.known_license and self.known_license.id in LICENSES:
            return self.known_license.id
        return None

    def common_props(self) -> typing.Dict[str, object]:
        """
        The metadata as JSON-LD object suitable for inclusion in CLDF metadata.
        """
        res = collections.OrderedDict()
        if self.title:
            res["dc:title"] = self.title
        if self.description:
            res["dc:description"] = self.description
        if self.citation:
            res["dc:bibliographicCitation"] = self.citation
        if self.url:
            res["dc:identifier"] = self.url
        # Prefer the URL of a recognized license; otherwise pass the raw value through.
        if self.known_license:
            res['dc:license'] = self.known_license.url
        elif self.license:
            res['dc:license'] = self.license
        return res

    def markdown(self) -> str:
        """
        Render the metadata as markdown text: title, citation advice and description.
        """
        lines = [
            '# {0}\n'.format(self.title or 'Dataset {0}'.format(self.id)),
            '## How to cite\n\nIf you use these data please cite',
        ]
        if self.citation:
            lines.append('- the original source')
            # NOTE(review): whitespace inside this literal may have been collapsed;
            # confirm the intended indentation of the quoted citation lines.
            lines.extend([" > {}".format(line) for line in self.citation.split('\n')])
            lines.extend([
                "- the derived dataset using the DOI of the "
                "[particular released version](../../releases/) you were using"
            ])
        else:  # pragma: no cover
            lines.extend([
                "this dataset using the DOI of the "
                "[particular released version](../../releases/) you were using"
            ])
        lines.append('\n## Description\n\n')
        if self.description:
            lines.append('{0}\n'.format(self.description))
        if self.license:
            lines.append('This dataset is licensed under a %s license\n' % self.license)
        if self.url:
            lines.append('Available online at %s\n' % self.url)
        return '\n'.join(lines)


def get_creators_and_contributors(text, strict=True) -> typing.Tuple[list, list]:
    """
    Split the people listed in the first markdown table of `text` into two lists.

    Column names are lower-cased. The `role` column may hold several comma-separated
    roles, and a person is considered once per role:

    - the roles `author`, `creator` and `maintainer` put the row (without its `role`
      column) into the creators list;
    - any other role puts it into the contributors list, with an added `type` key
      looked up (case-insensitively) in `CONTRIBUTOR_TYPES`.

    Identical entries are added only once per list.

    :param text: Markdown text to search for a table.
    :param strict: If `True`, a role missing from `CONTRIBUTOR_TYPES` raises \
    `KeyError`; otherwise such a role is given the type `'Other'`.
    :return: The pair `(creators, contributors)`; both are empty if `text` has no table.
    """
    creator_roles = {'author', 'creator', 'maintainer'}
    type_by_role = {name.lower(): name for name in CONTRIBUTOR_TYPES}
    creators = []
    contributors = []

    # Only the first table is considered; without one there is nothing to collect.
    no_table = object()
    first_table = next(iter_markdown_tables(text), no_table)
    if first_table is no_table:  # pragma: no cover
        return creators, contributors
    header, rows = first_table

    for cells in rows:
        record = {key.lower(): value for key, value in zip(header, cells)}
        person = {key: value for key, value in record.items() if key != 'role'}
        roles = nfilter(
            [part.strip().lower() for part in record.get('role', '').split(',')])
        for role in roles:
            # Each role gets its own copy, since contributor entries gain a `type` key.
            entry = dict(person)
            if role in creator_roles:
                target = creators
            else:
                if strict:
                    entry['type'] = type_by_role[role]
                else:
                    entry['type'] = type_by_role.get(role, 'Other')
                target = contributors
            if entry not in target:
                target.append(entry)
    return creators, contributors