# -*- coding: utf-8 -*-
###
# © 2018 The Board of Trustees of the Leland Stanford Junior University
# Nathaniel Watson
# nathankw@stanford.edu
###
"""
Contains a ``Profiles`` class for working with profiles on the ENCODE Portal. Note that
the terms 'profile' and 'schema' are used interchangeably in this package.
"""
import inflection
import logging
import requests
import encode_utils as eu
import encode_utils.utils as euu
#: ``logging`` instance for debug messages from this module. Its dotted name places it
#: beneath the logger named by ``eu.DEBUG_LOGGER_NAME``.
DEBUG_LOGGER = logging.getLogger(".".join((eu.DEBUG_LOGGER_NAME, __name__)))
#: ``logging`` instance for error messages from this module. Its dotted name places it
#: beneath the logger named by ``eu.ERROR_LOGGER_NAME``.
ERROR_LOGGER = logging.getLogger(".".join((eu.ERROR_LOGGER_NAME, __name__)))
class UnknownProfile(Exception):
    """
    Raised when the profile ID in question doesn't match any known profile ID.
    """
class EncodeSchema:
    """
    Wraps the JSON schema (profile) of one object type and exposes convenient views of
    its properties, such as which ones are required, identifying, or writable.
    """

    def __init__(self, name, schema):
        """
        Args:
            name: `str`. The name of the schema.
            schema: `dict`. The JSON representation of the schema on the portal.
        """
        self.name = name
        self.schema = schema
        # Lazily populated caches backing the properties of the same (public) names.
        self._properties = None
        self._non_writable_props = None
        self._writable_props = None

    @property
    def properties(self):
        """
        Returns:
            `list[EncodeSchemaProperty]`: A list of properties in the schema. The list
            is built on first access and cached afterwards.
        """
        if self._properties is None:
            # Look these up once rather than once per property.
            identifying = self.identifying_properties
            required = self.required_properties
            self._properties = [
                EncodeSchemaProperty(
                    prop_name, prop, prop_name in required, prop_name in identifying
                )
                for prop_name, prop in self.schema["properties"].items()
            ]
        return self._properties

    def get_property_from_name(self, name):
        """
        Args:
            name: `str`. The name of the property to search for.

        Returns:
            `EncodeSchemaProperty`: The property corresponding to `name`

        Raises:
            `ValueError` if a property with `name` is not found.
        """
        for prop in self.properties:
            if prop.name == name:
                return prop
        raise ValueError("Could not find property {} in schema".format(name))

    @property
    def identifying_properties(self):
        """
        Returns:
            `list`: A list of identifying property names, taken from the schema's
            `identifyingProperties` key. Empty when the schema doesn't have that key.
        """
        return self.schema.get("identifyingProperties", [])

    @property
    def has_award(self):
        """
        Returns:
            `bool`: Indicates if the schema has an `award` property present.
        """
        # Property names live under the schema's "properties" key, not at the top level.
        return eu.AWARD_PROP_NAME in self.schema.get("properties", {})

    @property
    def has_alias(self):
        """
        Returns:
            `bool`: Indicates if the schema has an `alias` property present.
        """
        # Property names live under the schema's "properties" key, not at the top level.
        return eu.ALIAS_PROP_NAME in self.schema.get("properties", {})

    @property
    def non_writable_props(self):
        """
        A list of the property names that are non-writable. These are the properties
        whose subschemas include ``EncodeSchemaProperty.NOT_SUBMITTABLE_FLAG`` or
        ``EncodeSchemaProperty.READ_ONLY_FLAG``. Computed on first access, then cached.
        """
        if self._non_writable_props is None:
            # Build into the private cache; going through the public property here would
            # re-enter this getter while the cache is still None.
            self._non_writable_props = [
                prop.name
                for prop in self.properties
                if prop.is_not_submittable or prop.is_read_only
            ]
        return self._non_writable_props

    @property
    def writable_props(self):
        """
        A list of the property names that are writable, which are those that don't
        fall into the ``self.non_writable_props`` category. Computed on first access,
        then cached.
        """
        if self._writable_props is None:
            self._writable_props = [
                prop.name
                for prop in self.properties
                if not prop.is_not_submittable and not prop.is_read_only
            ]
        return self._writable_props

    @property
    def required_properties(self):
        """
        Returns the list of required properties to submit when creating a new record
        under the given profile. Only works when the profile contains a "required" key
        at the top level, as it is in the biosample profile. Doesn't at this time
        recognize conditionally required keys that appear in a subschema, such as
        'anyOf' as demonstrated in the file profile.

        Returns:
            `list`: The list of required properties; empty when there is no top-level
            "required" key.
        """
        return self.schema.get("required", [])

    def filter_non_writable_props(self, rec_json, keep_identifying=False):
        """
        Filters out the non-writable properties from a record, using
        ``self.non_writable_props`` as a filtering basis.

        Args:
            rec_json: `dict`. The JSON serialization of a record that belongs to the
                profile encapsulated through this instance. It is modified in place.
            keep_identifying: `bool`. Setting this to True means to retain keys that are
                in the `identifyingProperties` object property of the schema.

        Returns:
            `dict`: The input minus any keys that aren't writable. This is the same
            object that was passed in.

        Raises:
            `ValueError` if `rec_json` has a key that is not a property of the schema.
        """
        non_writable = set(self.non_writable_props)
        # Iterate over a snapshot of the keys, since entries are removed along the way.
        for key in list(rec_json):
            prop = self.get_property_from_name(key)
            if keep_identifying and prop.is_identifying:
                continue
            if key in non_writable:
                rec_json.pop(key)
        return rec_json
class EncodeSchemaProperty:
    """
    A single property of a schema: its name, its sub-schema, and whether it is required
    and/or identifying within the schema it belongs to.
    """

    #: Constant storing the name of the property in a JSON object sub-schema that
    #: indicates whether the object is read only.
    READ_ONLY_FLAG = "readonly"
    #: Constant storing the name of the property in a JSON object sub-schema that
    #: indicates whether the object is submittable.
    NOT_SUBMITTABLE_FLAG = "notSubmittable"

    def __init__(self, prop_name, schema, is_required, is_identifying):
        """
        Args:
            prop_name: `str`. The name of the property in the schema.
            schema: `dict`. The sub-schema from the portal JSON schema that defines the
                property.
            is_required: `bool`. Indicates whether or not the given property is required.
            is_identifying: `bool`. Indicates whether or not the given property is
                identifying.
        """
        self.name, self.schema = prop_name, schema
        self.is_required, self.is_identifying = is_required, is_identifying

    @property
    def is_not_submittable(self):
        """
        Tells whether this property is one that a user cannot submit when creating or
        updating a record. Only the presence of ``NOT_SUBMITTABLE_FLAG`` in the
        sub-schema is checked, not the value stored under it.

        Returns:
            `bool`: `True` if this is a non-submittable property, `False` otherwise.
        """
        return self.NOT_SUBMITTABLE_FLAG in self.schema

    @property
    def is_read_only(self):
        """
        Tells whether this property is read-only and hence can't be modified by the
        end-user. Only the presence of ``READ_ONLY_FLAG`` in the sub-schema is checked,
        not the value stored under it.

        Returns:
            `bool`: `True` if this is a read-only property, `False` otherwise.
        """
        return self.READ_ONLY_FLAG in self.schema
class Profiles:
    """
    Encapsulates knowledge about the existing profiles on the Portal and contains useful
    methods for working with a given profile.

    A defining purpose of this class is to validate the profile ID specified in a POST
    payload passed to ``encode_utils.connection.Connection.post()``. This class is used
    to ensure that the profile specified there is a known profile on the Portal.

    Args:
        dcc_url: `str`. The portal URL being submitted to.
    """

    #: Constant storing the `file.json` profile's ID.
    FILE_PROFILE_ID = "file"
    #: Constant storing a property name of the `file.json` profile.
    SUBMITTED_FILE_PROP_NAME = "submitted_file_name"
    #: Constant storing a property name of the `file.json` profile.
    MD5SUM_NAME_PROP_NAME = "md5sum"
    #: Constant storing a property name of the `file.json` profile.
    FILE_SIZE_PROP_NAME = "file_size"

    # Singularized `@id` prefixes that don't match their profile ID. For example, the
    # profile antibody_lot has records whose @id looks like '/antibodies/ENCAB719MQZ'
    # instead of the expected '/antibody_lots/ENCAB719MQZ'.
    _IRREGULAR_PROFILE_IDS = {
        "antibody": "antibody_lot",
        "publication_datum": "publication_data",
    }

    def __init__(self, dcc_url):
        """
        Args:
            dcc_url: `str`. The dcc_url as specified by Connection.dcc_mode.url.
        """
        self.dcc_url = dcc_url
        # Populated on first access of ``self.profiles``.
        self._profiles = None

    def _get_profiles(self):
        """
        Creates a dictionary storing all public profiles on the Portal.

        Returns:
            `dict`: `dict` where each key is the profile's ID, and each value is an
            ``EncodeSchema`` wrapping the given profile's JSON schema. Each key is
            extracted from the profile's `id` property, after a little formatting first.
            The formatting works by removing the '/profiles/' prefix and the '.json'
            suffix. For example, the value of the `id` property for the
            `genetic_modification.json` profile is `/profiles/genetic_modification.json`.
            The corresponding key in this `dict` is `genetic_modification`.
        """
        url = euu.url_join([self.dcc_url, eu.PROFILES_URL, "?format=json"])
        profiles = requests.get(url,
                                timeout=eu.TIMEOUT,
                                headers=euu.REQUEST_HEADERS_JSON).json()
        # Remove the "private" profiles (i.e. _subtypes), since these have differing
        # semantics. The keys are collected first because the dict is modified.
        for private_key in [x for x in profiles if x.startswith("_")]:
            profiles.pop(private_key)
        # "@type" is a pseudo profile that doesn't count.
        profiles.pop("@type", None)
        profile_id_hash = {}  # Instead of name as key, profile ID is key.
        for schema in profiles.values():  # i.e. name=GeneticModification
            profile_id = schema["id"].split("/")[-1].split(".json")[0]
            profile_id_hash[profile_id] = EncodeSchema(profile_id, schema)
        return profile_id_hash

    @property
    def profiles(self):
        """
        `dict` set to the return value of ``self._get_profiles()``, which is called on
        first access only. See documentation there for details.
        """
        if self._profiles is None:
            self._profiles = self._get_profiles()
        return self._profiles

    @classmethod
    def _normalize_profile_id(cls, at_id):
        """
        Converts the leading path segment of an `@id` into the format used for the keys
        of ``self.profiles``: lowercase, underscore-separated and singular.

        Args:
            at_id: `str`. An `@id` from the portal, e.g. `/biosamples/ENCBS123ABC/`

        Returns:
            `str`: The normalized profile ID, e.g. `biosample`.
        """
        profile_id = at_id.strip("/").split("/")[0].lower()
        # Multi-word profile names are hyphen-separated, i.e. genetic-modifications.
        profile_id = profile_id.replace("-", "_")
        profile_id = inflection.singularize(profile_id)
        return cls._IRREGULAR_PROFILE_IDS.get(profile_id, profile_id)

    def profiles_with_property(self, property_name):
        """
        Returns a list of profile names that have a given property.

        Args:
            property_name: `str`. The name of the property.

        Returns:
            `list` of profile names.
        """
        # The values of self.profiles are schema wrapper objects; the raw JSON schema is
        # stored in their `schema` attribute.
        return [
            profile_name
            for profile_name, profile in self.profiles.items()
            if property_name in profile.schema["properties"]
        ]

    def get_profile_from_id(self, at_id):
        """
        Normalizes the profile ID found in `at_id` so that it matches the format of the
        profile IDs stored in ``self.profiles``, and ensures that the normalized profile
        ID is a member of that `dict`.

        Args:
            at_id: `str`. An `@id` from the portal, e.g. `/biosamples/ENCBS123ABC/`

        Returns:
            The value stored in ``self.profiles`` under the normalized profile ID.

        Raises:
            UnknownProfile: The normalized profile ID is not a key of ``self.profiles``.
        """
        profile_id = self._normalize_profile_id(at_id)
        if profile_id not in self.profiles:
            raise UnknownProfile("Unknown profile ID '{}'.".format(at_id))
        return self.profiles[profile_id]

    def remove_duplicate_associations(self, associations):
        """
        Checks for duplicates in array properties containing string elements. Need to be
        careful as some cases can be tricky, i.e.

            ['/documents/id1', 'id1']

        Such a duplicate should be identified and removed, leaving us with ["id1"].

        Args:
            associations: `list` of `str`. It is not modified.

        Returns:
            Deduplicated `list`, with the first occurrence of each value kept in its
            original position.
        """
        deduplicated = []
        seen = set()
        for val in associations:
            # A value such as '/documents/id1' is reduced to 'id1' when its prefix
            # resolves to a known profile.
            if val.startswith("/") and self._normalize_profile_id(val) in self.profiles:
                val = val.strip("/").split("/")[-1]
            if val not in seen:
                seen.add(val)
                deduplicated.append(val)
        return deduplicated