diff --git a/src/oaklib/cli.py b/src/oaklib/cli.py
index 4e365ba6d..1192461ed 100644
--- a/src/oaklib/cli.py
+++ b/src/oaklib/cli.py
@@ -107,6 +107,7 @@
 from oaklib.interfaces.summary_statistics_interface import SummaryStatisticsInterface
 from oaklib.interfaces.taxon_constraint_interface import TaxonConstraintInterface
 from oaklib.interfaces.text_annotator_interface import TextAnnotatorInterface
+from oaklib.interfaces.usages_interface import UsagesInterface
 from oaklib.io.heatmap_writer import HeatmapWriter
 from oaklib.io.html_writer import HTMLWriter
 from oaklib.io.obograph_writer import write_graph
@@ -1439,7 +1440,6 @@ def annotate(
 @output_type_option
 # TODO: the main output option uses a filelike object
 @click.option("-o", "--output", help="Path to output file")
-# @output_option
 def viz(
     terms,
     predicates,
@@ -4181,6 +4181,77 @@ def apply_taxon_constraints(
         writer.emit(st)
 
 
+@main.command()
+@autolabel_option
+@output_type_option
+@output_option
+@click.option(
+    "--used-by-prefix",
+    "-P",
+    multiple=True,
+    help="Only report usages whose used-by ID starts with this prefix. May be repeated.",
+)
+@click.argument("terms", nargs=-1)
+def usages(
+    terms,
+    autolabel: bool,
+    output_type: str,
+    output: str,
+    used_by_prefix: List,
+    **kwargs,
+):
+    """
+    List usages of a term or set of terms.
+
+    Usages of neuron in GO:
+
+        runoak -i sqlite:obo:go usages CL:0000540
+
+    Association/annotations sources can also be used:
+
+        runoak -i quickgo: usages GO:0031969
+
+    Note this query may be slow - you can restrict to a species:
+
+        runoak -i quickgo:NCBITaxon:9606 usages GO:0031969
+
+    (this should return no results, as there should be no human proteins annotated
+    to chloroplast membrane)
+
+    Using amigo:
+
+        runoak -i amigo: usages GO:0031969
+
+    Using ubergraph:
+
+        runoak -i ubergraph: usages CL:0000540
+
+    This will include usages over multiple ontologies
+
+    You can run multiple queries over multiple sources (an AggregatorImplementation):
+
+        runoak -i sqlite:obo:go -a ubergraph: -a amigo: -a quickgo: usages GO:0031969
+
+    """
+    impl = settings.impl
+    writer = _get_writer(output_type, impl, StreamingCsvWriter)
+    writer.autolabel = autolabel
+    writer.output = output
+    if not isinstance(impl, UsagesInterface):
+        raise NotImplementedError(
+            f"Cannot execute this using {settings.impl} of type {type(settings.impl)}"
+        )
+    used_by = None
+    if "@" in terms:
+        ix = terms.index("@")
+        curies = list(query_terms_iterator(terms[:ix], impl))
+        used_by = terms[ix + 1 :]
+    else:
+        curies = list(query_terms_iterator(terms, impl))
+    for usage in impl.usages(curies, used_by=used_by, used_by_prefixes=used_by_prefix, **kwargs):
+        writer.emit(usage)
+
+
+@main.command()
 @output_option
 @predicates_option
@@ -4662,6 +4733,7 @@ def apply_labels(group):
 @output_option
 @click.option(
     "--ontology-only/--no-ontology-only",
+    "-T",
     default=False,
     show_default=True,
     help="If true, perform a pseudo-enrichment analysis treating each term as an association to itself.",
@@ -4750,14 +4822,20 @@ def enrichment(
     actual_association_predicates = _process_predicates_arg(association_predicates)
     if sample_file:
         subjects = list(curies_from_file(sample_file, adapter=impl, allow_labels=allow_labels))
+        curies = list(query_terms_iterator(terms, impl))
     else:
         if "@" in terms:
+            if not ontology_only:
+                
raise ValueError("Cannot use @ with --no-ontology-only") ix = terms.index("@") logging.info(f"Splitting terms into two, position = {ix}") subjects = list(query_terms_iterator(terms[0:ix], impl)) - terms = terms[ix + 1 :] + curies = list(query_terms_iterator(terms[ix + 1 :], impl)) + logging.info(f"Num Subjects={len(subjects)} (using {len(curies)} terms)") else: subjects = list(query_terms_iterator(terms, impl)) + curies = None + logging.info(f"Num Subjects={len(subjects)} (using all terms)") if not subjects: raise ValueError("No terms or upload provided") background = ( @@ -4778,7 +4856,6 @@ def enrichment( writer = _get_writer(output_type, impl, StreamingYamlWriter) writer.autolabel = autolabel writer.output = output - curies = list(query_terms_iterator(terms, impl)) results = impl.enriched_classes( subjects, predicates=actual_association_predicates, diff --git a/src/oaklib/conf/obograph-style.json b/src/oaklib/conf/obograph-style.json index 63eb9a653..91bba2066 100644 --- a/src/oaklib/conf/obograph-style.json +++ b/src/oaklib/conf/obograph-style.json @@ -70,6 +70,12 @@ "penwidth": 2, "label": "ⓗ" }, + "RO:0002333": { + "arrowhead": "box", + "color": "green", + "penwidth": 2, + "label": "ⓔ" + }, "RO:0004009": { "arrowhead": "box", "color": "green", @@ -97,6 +103,21 @@ "color": "red", "label": "⇩" }, + "RO:0002233": { + "arrowhead": "diamond", + "color": "blue", + "label": "→⊚" + }, + "RO:0002234": { + "arrowhead": "diamond", + "color": "blue", + "label": "⊚→" + }, + "RO:0002400": { + "arrowhead": "diamond", + "color": "blue", + "label": "⇨⊚" + }, "RO:0002220": { "color": "gray", "label": "A" diff --git a/src/oaklib/datamodels/vocabulary.py b/src/oaklib/datamodels/vocabulary.py index eb1f845dc..922fdbca2 100644 --- a/src/oaklib/datamodels/vocabulary.py +++ b/src/oaklib/datamodels/vocabulary.py @@ -117,6 +117,7 @@ LOCATED_IN = "RO:0001025" DEVELOPS_FROM = "RO:0002202" HAS_PART = "BFO:0000051" +OCCURS_IN = "BFO:0000066" ONLY_IN_TAXON = "RO:0002160" NEVER_IN_TAXON = 
"RO:0002161" IN_TAXON = "RO:0002162" @@ -124,6 +125,11 @@ NEGATIVELY_REGULATES = "RO:0002212" POSITIVELY_REGULATES = "RO:0002213" REGULATES = "RO:0002211" +ENABLES = "RO:0002327" +ENABLED_BY = "RO:0002333" +HAS_DIRECT_INPUT = "RO:0002400" +HAS_INPUT = "RO:0002233" +HAS_OUTPUT = "RO:0002234" BIOLOGICAL_PROCESS = "GO:0008150" CELLULAR_COMPONENT = "GO:0005575" diff --git a/src/oaklib/implementations/__init__.py b/src/oaklib/implementations/__init__.py index 31e66d894..b3e219e51 100644 --- a/src/oaklib/implementations/__init__.py +++ b/src/oaklib/implementations/__init__.py @@ -12,13 +12,13 @@ from oaklib.implementations.agrkb.agrkb_implementation import AGRKBImplementation from oaklib.implementations.amigo.amigo_implementation import AmiGOImplementation from oaklib.implementations.cx.cx_implementation import CXImplementation -from oaklib.implementations.eutils.pubmed_implementation import PubMedImplementation from oaklib.implementations.funowl.funowl_implementation import FunOwlImplementation from oaklib.implementations.gilda import GildaImplementation from oaklib.implementations.kgx.kgx_implementation import KGXImplementation from oaklib.implementations.llm_implementation import LLMImplementation from oaklib.implementations.monarch.monarch_implementation import MonarchImplementation from oaklib.implementations.ncbi.ncbi_gene_implementation import NCBIGeneImplementation +from oaklib.implementations.ncbi.pubmed_implementation import PubMedImplementation from oaklib.implementations.ols import ( BaseOlsImplementation, OlsImplementation, @@ -44,6 +44,7 @@ PantherDBImplementation, ) from oaklib.implementations.pronto.pronto_implementation import ProntoImplementation +from oaklib.implementations.quickgo.quickgo_implementation import QuickGOImplementation from oaklib.implementations.semsimian.semsimian_implementation import ( SemSimianImplementation, ) @@ -86,6 +87,7 @@ "NCBIGeneImplementation", "OntobeeImplementation", "ProntoImplementation", + "QuickGOImplementation", 
"SimpleOboImplementation", "SqlImplementation", "UbergraphImplementation", diff --git a/src/oaklib/implementations/aggregator/aggregator_implementation.py b/src/oaklib/implementations/aggregator/aggregator_implementation.py index 758c7f8e0..2e991512e 100644 --- a/src/oaklib/implementations/aggregator/aggregator_implementation.py +++ b/src/oaklib/implementations/aggregator/aggregator_implementation.py @@ -1,7 +1,7 @@ from collections import defaultdict from dataclasses import dataclass from io import TextIOWrapper -from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple +from typing import Any, Callable, Dict, Iterable, Iterator, List, Optional, Tuple, Type from sssom_schema import Mapping @@ -12,6 +12,7 @@ ValidationConfiguration, ValidationResult, ) +from oaklib.interfaces.association_provider_interface import AssociationProviderInterface from oaklib.interfaces.basic_ontology_interface import ( ALIAS_MAP, DEFINITION, @@ -25,12 +26,14 @@ from oaklib.interfaces.relation_graph_interface import RelationGraphInterface from oaklib.interfaces.search_interface import SearchInterface from oaklib.interfaces.text_annotator_interface import TEXT, TextAnnotatorInterface +from oaklib.interfaces.usages_interface import UsagesInterface from oaklib.interfaces.validator_interface import ValidatorInterface from oaklib.types import CURIE, SUBSET_CURIE @dataclass class AggregatorImplementation( + AssociationProviderInterface, ValidatorInterface, RdfInterface, RelationGraphInterface, @@ -38,6 +41,7 @@ class AggregatorImplementation( SearchInterface, MappingProviderInterface, TextAnnotatorInterface, + UsagesInterface, ): """ An OAK adapter that wraps multiple implementations and integrates results together. 
@@ -78,10 +82,20 @@ class AggregatorImplementation( implementations: List[BasicOntologyInterface] = None - def _delegate_iterator(self, func: Callable) -> Iterator: + @property + def implementation_name(self): + impl_names = [] for i in self.implementations: - for v in func(i): - yield v + impl_names.append(i.implementation_name) + return "-".join(impl_names) + + def _delegate_iterator( + self, func: Callable, interface: Optional[Type[BasicOntologyInterface]] = None + ) -> Iterator: + for i in self.implementations: + if interface is None or isinstance(i, interface): + for v in func(i): + yield v def _delegate_simple_tuple_map(self, func: Callable, strict=False) -> Dict[Any, List[Any]]: m = defaultdict(list) @@ -107,11 +121,21 @@ def validate(self, configuration: ValidationConfiguration = None) -> Iterable[Va def entities(self, **kwargs) -> Iterable[CURIE]: return self._delegate_iterator(lambda i: i.entities(**kwargs)) + def relationships(self, *args, **kwargs) -> Iterable[CURIE]: + return self._delegate_iterator(lambda i: i.relationships(*args, **kwargs)) + def simple_mappings_by_curie(self, curie: CURIE) -> Iterable[Tuple[PRED_CURIE, CURIE]]: return self._delegate_iterator(lambda i: i.simple_mappings_by_curie(curie)) def get_sssom_mappings_by_curie(self, curie: CURIE) -> Iterable[Mapping]: - return self._delegate_iterator(lambda i: i.get_sssom_mappings_by_curie(curie)) + return self._delegate_iterator( + lambda i: i.get_sssom_mappings_by_curie(curie), MappingProviderInterface + ) + + def sssom_mappings(self, *args, **kwargs) -> Iterable[Mapping]: + return self._delegate_iterator( + lambda i: i.sssom_mappings(*args, **kwargs), MappingProviderInterface + ) def label(self, curie: CURIE, **kwargs) -> str: return self._delegate_first(lambda i: i.label(curie, **kwargs)) @@ -151,6 +175,11 @@ def outgoing_relationship_map(self, curie: CURIE) -> RELATIONSHIP_MAP: def incoming_relationship_map(self, curie: CURIE) -> RELATIONSHIP_MAP: return 
self._delegate_simple_tuple_map(lambda i: i.incoming_relationship_map(curie)) + def associations(self, *args, **kwargs) -> Iterable[CURIE]: + return self._delegate_iterator( + lambda i: i.associations(*args, **kwargs), AssociationProviderInterface + ) + def annotate_text( self, text: TEXT, configuration: Optional[TextAnnotationConfiguration] = None ) -> Iterable[TextAnnotation]: diff --git a/src/oaklib/implementations/amigo/amigo_implementation.py b/src/oaklib/implementations/amigo/amigo_implementation.py index c638213cc..04bc65ee6 100644 --- a/src/oaklib/implementations/amigo/amigo_implementation.py +++ b/src/oaklib/implementations/amigo/amigo_implementation.py @@ -21,6 +21,7 @@ ] from oaklib.interfaces.basic_ontology_interface import LANGUAGE_TAG, RELATIONSHIP +from oaklib.interfaces.usages_interface import UsagesInterface from oaklib.types import CURIE, PRED_CURIE, SUBSET_CURIE from oaklib.utilities.iterator_utils import chunk @@ -130,6 +131,7 @@ def _normalize(curie: CURIE) -> CURIE: class AmiGOImplementation( AssociationProviderInterface, SearchInterface, + UsagesInterface, ): """ Wraps AmiGO endpoint. 
diff --git a/src/oaklib/implementations/eutils/eutils_implementation.py b/src/oaklib/implementations/ncbi/eutils_implementation.py similarity index 82% rename from src/oaklib/implementations/eutils/eutils_implementation.py rename to src/oaklib/implementations/ncbi/eutils_implementation.py index 511b84dd8..35f1970f3 100644 --- a/src/oaklib/implementations/eutils/eutils_implementation.py +++ b/src/oaklib/implementations/ncbi/eutils_implementation.py @@ -5,6 +5,7 @@ from dataclasses import dataclass, field from typing import ClassVar, Iterable, Iterator, Optional, Tuple +import requests_cache from eutils import Client __all__ = [ @@ -22,6 +23,8 @@ logger = logging.getLogger(__name__) +NCBI_REQUESTS_CACHE = ".ncbi_requests_cache" + @dataclass class EUtilsImplementation(OboGraphInterface, ABC): @@ -30,10 +33,20 @@ class EUtilsImplementation(OboGraphInterface, ABC): """ entrez_client: Client = field(default_factory=lambda: Client()) + # 0.6.0 release in 2019 - considered switching to direct API calls? 
database: ClassVar[Optional[str]] = None entity_type: ClassVar[Optional[str]] = None + # alternative to entrez_client + _requests_session: requests_cache.CachedSession = None + + @property + def requests_session(self): + if self._requests_session is None: + self._requests_session = requests_cache.CachedSession(NCBI_REQUESTS_CACHE) + return self._requests_session + def label(self, curie: CURIE, lang: Optional[LANGUAGE_TAG] = None) -> Optional[str]: if lang is not None: raise NotImplementedError("lang not implemented for eutils") diff --git a/src/oaklib/implementations/ncbi/ncbi_gene_implementation.py b/src/oaklib/implementations/ncbi/ncbi_gene_implementation.py index 2c201d43e..40d1b6720 100644 --- a/src/oaklib/implementations/ncbi/ncbi_gene_implementation.py +++ b/src/oaklib/implementations/ncbi/ncbi_gene_implementation.py @@ -8,9 +8,10 @@ import requests from oaklib.constants import TIMEOUT_SECONDS -from oaklib.datamodels.association import Association +from oaklib.datamodels.association import Association, NegatedAssociation from oaklib.datamodels.search import SearchConfiguration from oaklib.datamodels.vocabulary import RDFS_LABEL +from oaklib.implementations.ncbi.eutils_implementation import EUtilsImplementation from oaklib.interfaces import SearchInterface from oaklib.interfaces.association_provider_interface import ( AssociationProviderInterface, @@ -25,6 +26,7 @@ EFETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi" ESEARCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi" +ELINK_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi" logger = logging.getLogger(__name__) @@ -32,6 +34,7 @@ @dataclass class NCBIGeneImplementation( AssociationProviderInterface, + EUtilsImplementation, SearchInterface, ): """ @@ -76,6 +79,27 @@ def labels( yield curie, self.property_cache.get(curie, RDFS_LABEL) def associations( + self, + subjects: Iterable[CURIE] = None, + predicates: Iterable[PRED_CURIE] = None, + objects: 
Iterable[CURIE] = None, + **kwargs, + ) -> Iterator[Association]: + if subjects is not None: + subjects = list(subjects) + if predicates is not None: + predicates = list(predicates) + if objects is not None: + objects = list(objects) + # TODO: branch logic based on predicates if specified + yield from self.go_associations( + subjects=subjects, predicates=predicates, objects=objects, **kwargs + ) + yield from self.pubmed_associations( + subjects=subjects, predicates=predicates, objects=objects, **kwargs + ) + + def go_associations( self, subjects: Iterable[CURIE] = None, predicates: Iterable[PRED_CURIE] = None, @@ -88,11 +112,26 @@ def associations( add_closure_fields: bool = False, **kwargs, ) -> Iterator[Association]: + """ + Extracts GO associations from the NCBIGene database. + + :param subjects: + :param predicates: + :param objects: + :param property_filter: + :param subject_closure_predicates: + :param predicate_closure_predicates: + :param object_closure_predicates: + :param include_modified: + :param add_closure_fields: + :param kwargs: + :return: + """ logging.info(f"SUBJS: {subjects}") if subjects: subjects = list(subjects) if not subjects: - raise ValueError("NCBIGene requires subjects") + raise ValueError("NCBIGene requires subjects to be specified") for subject in subjects: if subject.startswith("NCBIGene:"): @@ -106,14 +145,76 @@ def associations( "id": gene_id, "retmode": "xml", } - response = requests.get(EFETCH_URL, params, timeout=TIMEOUT_SECONDS) + response = self.requests_session.get(EFETCH_URL, params, timeout=TIMEOUT_SECONDS) # Parsing the XML file root = ElementTree.fromstring(response.content) # noqa S314 - yield from self.associations_from_xml(subject, root) + yield from self._go_associations_from_xml(subject, root) + + def pubmed_associations( + self, + subjects: Iterable[CURIE] = None, + predicates: Iterable[PRED_CURIE] = None, + objects: Iterable[CURIE] = None, + property_filter: Dict[PRED_CURIE, Any] = None, + 
subject_closure_predicates: Optional[List[PRED_CURIE]] = None, + predicate_closure_predicates: Optional[List[PRED_CURIE]] = None, + object_closure_predicates: Optional[List[PRED_CURIE]] = None, + include_modified: bool = False, + add_closure_fields: bool = False, + **kwargs, + ) -> Iterator[Association]: + """ + Extracts pubmed associations from the NCBIGene database. + + :param subjects: + :param predicates: + :param objects: + :param property_filter: + :param subject_closure_predicates: + :param predicate_closure_predicates: + :param object_closure_predicates: + :param include_modified: + :param add_closure_fields: + :param kwargs: + :return: + """ + logging.info(f"SUBJS: {subjects}") + if subjects: + subjects = list(subjects) + if not subjects: + raise ValueError("NCBIGene requires subjects to be specified") + + for subject in subjects: + if subject.startswith("NCBIGene:"): + gene_id = subject.split(":")[1] + elif subject.isnumeric(): + gene_id = subject + else: + raise ValueError("NCBIGene requires subjects to be NCBIGene CURIEs or numbers") + params = { + "dbfrom": "gene", # Link from Gene database + "db": "pubmed", # Link to PubMed database + "id": gene_id, # Gene ID (as number) + "linkname": "gene_pubmed", # Predefined link name for gene to PubMed + } + response = requests.get(ELINK_URL, params, timeout=TIMEOUT_SECONDS) + + # Parsing the XML file + root = ElementTree.fromstring(response.text) # noqa S314 + pmids = [linksetdb.text for linksetdb in root.findall(".//Link/Id")] + + for pmid in pmids: + yield Association( + subject=subject, + # subject_label=gene_symbol, + predicate="biolink:mentioned_by", + object=f"PMID:{pmid}", + # object_label=go_label, + ) - def associations_from_xml(self, subject, root): + def _go_associations_from_xml(self, subject, root): """ Extracts associations from the XML file @@ -152,10 +253,16 @@ def associations_from_xml(self, subject, root): evidence = other_source.find("Other-source_post-text").text evidence = evidence.split(": 
")[1] # Adding the extracted information to the list - assoc = Association( + preds = predicate.split(" ") + main_predicate = preds[-1] + cls = Association + if len(preds) > 1: + if preds[0] == "NOT": + cls = NegatedAssociation + assoc = cls( subject=subject, subject_label=gene_symbol, - predicate=predicate, + predicate=f"biolink:{main_predicate}", object=go_id, object_label=go_label, evidence_type=evidence, diff --git a/src/oaklib/implementations/eutils/pubmed_implementation.py b/src/oaklib/implementations/ncbi/pubmed_implementation.py similarity index 78% rename from src/oaklib/implementations/eutils/pubmed_implementation.py rename to src/oaklib/implementations/ncbi/pubmed_implementation.py index b8e389227..b477b6faa 100644 --- a/src/oaklib/implementations/eutils/pubmed_implementation.py +++ b/src/oaklib/implementations/ncbi/pubmed_implementation.py @@ -10,7 +10,7 @@ from dataclasses import dataclass from oaklib.datamodels import obograph -from oaklib.implementations.eutils.eutils_implementation import EUtilsImplementation +from oaklib.implementations.ncbi.eutils_implementation import EUtilsImplementation __all__ = [ "PubMedImplementation", @@ -37,7 +37,10 @@ def node( **kwargs, ) -> obograph.Node: ec = self.entrez_client + if ":" in curie and not curie.startswith("PMID:"): + return obograph.Node(id=curie, lbl=curie) local_id = curie.replace("PMID:", "") + logger.info(f"Fetching {local_id} from {self.database}") paset = ec.efetch(db=self.database, id=local_id) for pa in paset: n = obograph.Node( diff --git a/src/oaklib/implementations/ontobee/ontobee_implementation.py b/src/oaklib/implementations/ontobee/ontobee_implementation.py index 26cdcd168..f3b3df3bc 100644 --- a/src/oaklib/implementations/ontobee/ontobee_implementation.py +++ b/src/oaklib/implementations/ontobee/ontobee_implementation.py @@ -7,13 +7,18 @@ from oaklib.interfaces.mapping_provider_interface import MappingProviderInterface from oaklib.interfaces.obograph_interface import OboGraphInterface from 
oaklib.interfaces.search_interface import SearchInterface +from oaklib.interfaces.usages_interface import UsagesInterface ONTOBEE_MERGED_GRAPH_PREFIX = "http://purl.obolibrary.org/obo/merged/" @dataclass class OntobeeImplementation( - AbstractSparqlImplementation, SearchInterface, MappingProviderInterface, OboGraphInterface + AbstractSparqlImplementation, + SearchInterface, + MappingProviderInterface, + OboGraphInterface, + UsagesInterface, ): """ An OAK adapter that standardizes access to the Ontobee sparql endpoint. diff --git a/src/oaklib/implementations/quickgo/__init__.py b/src/oaklib/implementations/quickgo/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/src/oaklib/implementations/quickgo/quickgo_implementation.py b/src/oaklib/implementations/quickgo/quickgo_implementation.py new file mode 100644 index 000000000..438db56ab --- /dev/null +++ b/src/oaklib/implementations/quickgo/quickgo_implementation.py @@ -0,0 +1,180 @@ +import logging +import time +from dataclasses import dataclass +from typing import Any, Dict, Iterable, Iterator, List, Mapping, Optional + +import requests_cache + +from oaklib.datamodels import obograph +from oaklib.datamodels.association import Association +from oaklib.interfaces import OboGraphInterface, SearchInterface +from oaklib.interfaces.association_provider_interface import ( + AssociationProviderInterface, +) +from oaklib.interfaces.basic_ontology_interface import ( + LANGUAGE_TAG, + RELATIONSHIP, +) +from oaklib.interfaces.usages_interface import UsagesInterface +from oaklib.types import CURIE, PRED_CURIE + +logger = logging.getLogger(__name__) + + +QUICKGO_REQUESTS_CACHE = ".quickgo_requests_cache" + + +BASE_URL = "https://www.ebi.ac.uk/QuickGO/services" + + +@dataclass +class QuickGOImplementation( + OboGraphInterface, + AssociationProviderInterface, + SearchInterface, + UsagesInterface, +): + _requests_session: requests_cache.CachedSession = None + + _source: str = None + + def __post_init__(self): + 
self._source = self.resource.slug + + def requests_session(self): + if self._requests_session is None: + self._requests_session = requests_cache.CachedSession(QUICKGO_REQUESTS_CACHE) + return self._requests_session + + def associations( + self, + subjects: Iterable[CURIE] = None, + predicates: Iterable[PRED_CURIE] = None, + objects: Iterable[CURIE] = None, + property_filter: Dict[PRED_CURIE, Any] = None, + subject_closure_predicates: Optional[List[PRED_CURIE]] = None, + predicate_closure_predicates: Optional[List[PRED_CURIE]] = None, + object_closure_predicates: Optional[List[PRED_CURIE]] = None, + include_modified: bool = False, + **kwargs, + ) -> Iterator[Association]: + session = self.requests_session() + if subjects and not isinstance(subjects, list): + subjects = list(subjects) + if objects and not isinstance(objects, list): + objects = list(objects) + url = f"{BASE_URL}/annotation/search" + + params = {"includeFields": "goName,name"} + # QuickGO also allows querying miRNA, for now limit searches to gene-centric subsets + params["proteome"] = "gcrpCan" + params["geneProductType"] = "protein" + if subjects: + subjects = [x.replace("UniProtKB:", "") for x in subjects] + params["geneProductId"] = ",".join(subjects) + if objects: + params["goId"] = ",".join(objects) + if object_closure_predicates: + params["goUsage"] = "descendants" + else: + params["goUsage"] = "exact" + if predicates: + params["goUsageRelationships"] = ",".join(predicates) + if self._source: + params["taxonId"] = self._source.replace("NCBITaxon:", "") + + def _xrefs(x: Any) -> Iterator[str]: + if x is None: + return + if isinstance(x, list): + for y in x: + yield from _xrefs(y) + elif "connectedXrefs" in x: + yield from _xrefs(x["connectedXrefs"]) + else: + yield f"{x['db']}:{x['id']}" + + def gen_assoc(result: dict) -> Association: + qualifier = result.get("qualifier", None) + if "goId" not in result: + raise ValueError(f"Missing goId in {result}") + with_from = list(_xrefs(result["withFrom"])) 
+ return Association( + subject=result["geneProductId"], + subject_label=result["symbol"], + predicate=f"biolink:{qualifier}", + object=result["goId"], + object_label=result["goName"], + evidence_type=result["evidenceCode"], + publications=[result["reference"]], + supporting_objects=with_from, + ) + + page = 0 + while True: + page += 1 + params["page"] = page + response = session.get(url, params=params) + + response.raise_for_status() + obj = response.json() + results = obj["results"] + + for result in results: + yield gen_assoc(result) + + page_info = obj["pageInfo"] + num_rows = page_info["resultsPerPage"] * page + if num_rows >= page_info["total"]: + break + else: + time.sleep(0.1) + + def node( + self, curie: CURIE, strict=False, include_metadata=False, expand_curies=False + ) -> obograph.Node: + session = self.requests_session() + q = curie.replace("UniProtKB:", "") + url = f"{BASE_URL}/geneproduct/{q}" + response = session.get(url) + if response.status_code == 500 and not strict: + return obograph.Node(id=curie) + response.raise_for_status() + obj = response.json() + results = obj["results"] + if not results: + return obograph.Node(id=curie) + result = results[0] + node = obograph.Node( + id=curie, + lbl=result["symbol"], + meta=None, + ) + return node + + def label(self, curie: CURIE, lang: Optional[LANGUAGE_TAG] = None) -> Optional[str]: + try: + node = self.node(curie) + if node: + return node.lbl + except ValueError: + return None + + def relationships( + self, + subjects: Iterable[CURIE] = None, + predicates: Iterable[PRED_CURIE] = None, + objects: Iterable[CURIE] = None, + include_tbox: bool = True, + include_abox: bool = True, + include_entailed: bool = False, + exclude_blank: bool = True, + ) -> Iterator[RELATIONSHIP]: + for x in []: + yield x + return + + def sssom_mappings(self, *args, **kwargs) -> Iterable[Mapping]: + for x in []: + yield x + return diff --git a/src/oaklib/implementations/sqldb/sql_implementation.py 
b/src/oaklib/implementations/sqldb/sql_implementation.py index b4bbeae06..ba511b644 100644 --- a/src/oaklib/implementations/sqldb/sql_implementation.py +++ b/src/oaklib/implementations/sqldb/sql_implementation.py @@ -149,6 +149,7 @@ from oaklib.interfaces.semsim_interface import SemanticSimilarityInterface from oaklib.interfaces.summary_statistics_interface import SummaryStatisticsInterface from oaklib.interfaces.taxon_constraint_interface import TaxonConstraintInterface +from oaklib.interfaces.usages_interface import UsagesInterface from oaklib.interfaces.validator_interface import ValidatorInterface from oaklib.types import CATEGORY_CURIE, CURIE, SUBSET_CURIE from oaklib.utilities.axioms.logical_definition_utilities import ( @@ -278,6 +279,7 @@ class SqlImplementation( OwlInterface, DumperInterface, MergeInterface, + UsagesInterface, ): """ A :class:`OntologyInterface` implementation that wraps a SQL Relational Database. diff --git a/src/oaklib/implementations/ubergraph/ubergraph_implementation.py b/src/oaklib/implementations/ubergraph/ubergraph_implementation.py index b650f60b5..42baa4b11 100644 --- a/src/oaklib/implementations/ubergraph/ubergraph_implementation.py +++ b/src/oaklib/implementations/ubergraph/ubergraph_implementation.py @@ -23,6 +23,7 @@ from oaklib.interfaces.relation_graph_interface import RelationGraphInterface from oaklib.interfaces.search_interface import SearchInterface from oaklib.interfaces.semsim_interface import SemanticSimilarityInterface +from oaklib.interfaces.usages_interface import UsagesInterface from oaklib.types import CURIE, PRED_CURIE from oaklib.utilities.graph.networkx_bridge import transitive_reduction_by_predicate from oaklib.utilities.semsim.similarity_utils import setwise_jaccard_similarity @@ -53,6 +54,7 @@ class UbergraphImplementation( MappingProviderInterface, SemanticSimilarityInterface, SubsetterInterface, + UsagesInterface, ): """ Wraps the Ubergraph sparql endpoint diff --git 
a/src/oaklib/interfaces/association_provider_interface.py b/src/oaklib/interfaces/association_provider_interface.py index 225178d04..ed86d0662 100644 --- a/src/oaklib/interfaces/association_provider_interface.py +++ b/src/oaklib/interfaces/association_provider_interface.py @@ -196,7 +196,7 @@ def associations( raise NotImplementedError ix = self._association_index if ix is None: - logging.warning(f"No association index for {type(self)}") + logging.info(f"No association index for {type(self)}") return yield from ix.lookup(subjects, predicates, objects) diff --git a/src/oaklib/interfaces/basic_ontology_interface.py b/src/oaklib/interfaces/basic_ontology_interface.py index 22038ab30..78dbe86fb 100644 --- a/src/oaklib/interfaces/basic_ontology_interface.py +++ b/src/oaklib/interfaces/basic_ontology_interface.py @@ -637,7 +637,7 @@ def entities(self, filter_obsoletes=True, owl_type=None) -> Iterable[CURIE]: :param owl_type: CURIE for RDF metaclass for the object, e.g. owl:Class :return: iterator """ - raise NotImplementedError + raise NotImplementedError(f"{type(self)} does not implement entities()") @deprecated("Replaced by entities") def all_entity_curies(self, **kwargs) -> Iterable[CURIE]: diff --git a/src/oaklib/interfaces/class_enrichment_calculation_interface.py b/src/oaklib/interfaces/class_enrichment_calculation_interface.py index 0b814e113..62922154c 100644 --- a/src/oaklib/interfaces/class_enrichment_calculation_interface.py +++ b/src/oaklib/interfaces/class_enrichment_calculation_interface.py @@ -129,7 +129,7 @@ def enriched_classes( ) } - hypotheses = [x for x in hypotheses if bg_count[x] > 1] + hypotheses = [x for x in hypotheses if sample_count.get(x, 0) > 1] logging.info("Filtered hypotheses: {}".format(hypotheses)) num_hypotheses = len(hypotheses) diff --git a/src/oaklib/interfaces/obograph_interface.py b/src/oaklib/interfaces/obograph_interface.py index 05b67fbbc..bf5b855ae 100644 --- a/src/oaklib/interfaces/obograph_interface.py +++ 
b/src/oaklib/interfaces/obograph_interface.py
@@ -309,7 +309,7 @@ def ancestors(
         :param start_curies: curie or curies to start the walk from
         :param predicates: only traverse over these (traverses over all if this is not set)
         :param reflexive: include self
-        :param method:
+        :param method: HOP or ENTAILMENT
         :return: all ancestor CURIEs
         """
         if method and method == GraphTraversalMethod.ENTAILMENT:
diff --git a/src/oaklib/interfaces/ontology_interface.py b/src/oaklib/interfaces/ontology_interface.py
index 59c5d7922..29e96518e 100644
--- a/src/oaklib/interfaces/ontology_interface.py
+++ b/src/oaklib/interfaces/ontology_interface.py
@@ -14,6 +14,13 @@ class OntologyInterface:
     resource: OntologyResource = None
     strict: bool = False
 
+    @property
+    def implementation_name(self):
+        n = type(self).__name__
+        if self.resource and self.resource.slug:
+            n += "-" + self.resource.slug
+        return n
+
     @classmethod
     def create(cls, resource: OntologyResource) -> "OntologyInterface":
         """
diff --git a/src/oaklib/interfaces/usages_interface.py b/src/oaklib/interfaces/usages_interface.py
new file mode 100644
index 000000000..d6e704868
--- /dev/null
+++ b/src/oaklib/interfaces/usages_interface.py
@@ -0,0 +1,176 @@
+import logging
+from abc import ABC
+from enum import Enum
+from typing import Iterable, List, Optional
+
+from pydantic import BaseModel
+
+from oaklib.datamodels.vocabulary import IS_A
+from oaklib.interfaces import MappingProviderInterface, OboGraphInterface
+from oaklib.interfaces.association_provider_interface import AssociationProviderInterface
+from oaklib.interfaces.basic_ontology_interface import BasicOntologyInterface
+from oaklib.types import CURIE
+
+logger = logging.getLogger(__name__)
+
+
+class UsageContext(str, Enum):
+    """
+    The context of a usage
+    """
+
+    RELATIONSHIP_SUBJECT = "relationship_subject"
+    RELATIONSHIP_OBJECT = "relationship_object"
+    RELATIONSHIP_PREDICATE = "relationship_predicate"
+    LOGICAL_DEFINITION_GENUS = "logical_definition_genus"
+    LOGICAL_DEFINITION_PREDICATE = "logical_definition_predicate"
+    LOGICAL_DEFINITION_FILLER = "logical_definition_filler"
+    ASSOCIATION_SUBJECT = "association_subject"
+    ASSOCIATION_PREDICATE = "association_predicate"
+    ASSOCIATION_OBJECT = "association_object"
+    MAPPING_SUBJECT = "mapping_subject"
+    MAPPING_OBJECT = "mapping_object"
+
+
+class Usage(BaseModel):
+    """
+    Represents a usage of a term
+    """
+
+    used_id: str
+    used_by_id: str
+    predicate: Optional[str] = None
+    source: str
+    context: UsageContext
+    description: Optional[str] = None
+
+
+class UsagesInterface(BasicOntologyInterface, ABC):
+    """
+    Allows querying for usages of terms
+    """
+
+    def usages(
+        self,
+        curies: List[CURIE],
+        used_by: Optional[List[CURIE]] = None,
+        used_by_prefixes: Optional[List[str]] = None,
+        **kwargs,
+    ) -> Iterable[Usage]:
+        """
+        Get usages of a term
+
+        :param curies: terms whose usages are looked up
+        :param used_by: if set, only yield usages whose used_by_id is in this list
+        :param used_by_prefixes: if set, only yield usages whose used_by_id starts with one of these
+        :param kwargs: forwarded to the unfiltered recursive call when a filter is set
+        :return: iterator over usages
+        """
+        logger.info(f"Getting usages for {curies}, prefixes={used_by_prefixes}")
+
+        if used_by or used_by_prefixes:
+            for usage in self.usages(curies, **kwargs):
+                ok = True
+                if used_by:
+                    if usage.used_by_id not in used_by:
+                        ok = False
+                if used_by_prefixes:
+                    if not any(usage.used_by_id.startswith(p) for p in used_by_prefixes):
+                        ok = False
+                if not ok:
+                    logger.debug(f"Skipping {usage} as not in used_by")
+                if ok:
+                    yield usage
+            return
+
+        def _source_id():
+            return self.implementation_name
+
+        logger.info(f"Checking relationships subjects for {len(curies)} curies")
+        for s, p, o in self.relationships(curies):
+            yield Usage(
+                used_id=s,
+                used_by_id=o,
+                predicate=p,
+                source=_source_id(),
+                context=UsageContext.RELATIONSHIP_SUBJECT,
+            )
+        logger.info(f"Checking relationships objects for {len(curies)} curies")
+        for s, p, o in self.relationships(objects=curies):
+            yield Usage(
+                used_id=o,
+                used_by_id=s,
+                predicate=p,
+                source=_source_id(),
+                context=UsageContext.RELATIONSHIP_OBJECT,
+            )
+        logger.info(f"Checking relationships predicates for {len(curies)} curies")
+        for _s, p, o in self.relationships(predicates=curies):
+            # TODO: used_by is a relationship
+            yield Usage(
+                used_id=p,
+                used_by_id=o,
+                predicate=p,
+                source=_source_id(),
+                context=UsageContext.RELATIONSHIP_PREDICATE,
+            )
+        logger.info(f"Checking logical definitions for {len(curies)} curies")
+        if isinstance(self, OboGraphInterface):
+            for ldef in self.logical_definitions(objects=curies):
+                for genus in ldef.genusIds:
+                    if genus in curies:
+                        yield Usage(
+                            used_id=genus,
+                            used_by_id=ldef.definedClassId,
+                            predicate=IS_A,
+                            source=_source_id(),
+                            context=UsageContext.LOGICAL_DEFINITION_GENUS,
+                        )
+                for r in ldef.restrictions:
+                    if r.propertyId in curies:
+                        yield Usage(
+                            used_id=r.propertyId,
+                            used_by_id=ldef.definedClassId,
+                            predicate=r.propertyId,
+                            source=_source_id(),
+                            context=UsageContext.LOGICAL_DEFINITION_PREDICATE,
+                        )
+                    if r.fillerId in curies:
+                        yield Usage(
+                            used_id=r.fillerId,
+                            used_by_id=ldef.definedClassId,
+                            predicate=r.propertyId,
+                            source=_source_id(),
+                            context=UsageContext.LOGICAL_DEFINITION_FILLER,
+                        )
+        logger.info(f"Checking associations for {len(curies)} curies")
+        if isinstance(self, AssociationProviderInterface):
+            for a in self.associations(objects=curies, object_closure_predicates=[]):
+                yield Usage(
+                    used_id=a.object,
+                    used_by_id=a.subject,
+                    predicate=a.predicate,
+                    source=_source_id(),
+                    context=UsageContext.ASSOCIATION_OBJECT,
+                )
+        logger.info(f"Checking mappings for {len(curies)} curies")
+        if isinstance(self, MappingProviderInterface):
+            for m in self.sssom_mappings(curies):
+                if m.subject_id in curies:
+                    yield Usage(
+                        used_id=m.subject_id,
+                        used_by_id=m.object_id,
+                        predicate=m.predicate_id,
+                        source=_source_id(),
+                        context=UsageContext.MAPPING_SUBJECT,
+                    )
+                elif m.object_id in curies:
+                    yield Usage(
+                        used_id=m.object_id,
+                        used_by_id=m.subject_id,
+                        predicate=m.predicate_id,
+                        source=_source_id(),
+                        context=UsageContext.MAPPING_OBJECT,
+                    )
+                else:
+                    raise AssertionError(f"Mapping {m} not in curies {curies}")
diff --git 
a/src/oaklib/io/streaming_writer.py b/src/oaklib/io/streaming_writer.py index 4e79e1716..1d142b73e 100644 --- a/src/oaklib/io/streaming_writer.py +++ b/src/oaklib/io/streaming_writer.py @@ -7,6 +7,7 @@ from linkml_runtime import SchemaView from linkml_runtime.dumpers import json_dumper from linkml_runtime.utils.yamlutils import YAMLRoot +from pydantic import BaseModel from oaklib import BasicOntologyInterface from oaklib.datamodels.obograph import Node @@ -65,7 +66,9 @@ def output(self, value) -> None: else: self.file = self._output - def emit(self, obj: Union[YAMLRoot, dict, CURIE], label_fields: Optional[List[str]] = None): + def emit( + self, obj: Union[YAMLRoot, BaseModel, dict, CURIE], label_fields: Optional[List[str]] = None + ): """ Emit an object or CURIE @@ -77,6 +80,8 @@ def emit(self, obj: Union[YAMLRoot, dict, CURIE], label_fields: Optional[List[st self.emit_curie(obj) elif isinstance(obj, Node): self.emit_curie(obj.id) + elif isinstance(obj, BaseModel): + self.emit_obj(obj) elif isinstance(obj, dict): self.emit_curie(obj[ID_KEY], obj.get(LABEL_KEY, None)) else: @@ -100,7 +105,7 @@ def emit_multiple(self, entities: Iterable[CURIE], **kwargs): def emit_curie(self, curie: CURIE, label=None): raise NotImplementedError - def emit_obj(self, obj: YAMLRoot): + def emit_obj(self, obj: Union[YAMLRoot, BaseModel]): obj_as_dict = json_dumper.to_dict(obj) return self.emit(obj_as_dict) diff --git a/src/oaklib/io/streaming_yaml_writer.py b/src/oaklib/io/streaming_yaml_writer.py index 5b04ef4be..c142f84e5 100644 --- a/src/oaklib/io/streaming_yaml_writer.py +++ b/src/oaklib/io/streaming_yaml_writer.py @@ -5,6 +5,7 @@ from linkml_runtime import CurieNamespace from linkml_runtime.dumpers import json_dumper from linkml_runtime.utils.yamlutils import YAMLRoot +from pydantic import BaseModel from oaklib.io.streaming_writer import StreamingWriter @@ -27,7 +28,7 @@ class StreamingYamlWriter(StreamingWriter): def emit(self, obj: Union[YAMLRoot, dict], label_fields=None): if 
self.object_count: self.file.write("\n---\n") - if isinstance(obj, YAMLRoot): + if isinstance(obj, (YAMLRoot, BaseModel)): obj_as_dict = json_dumper.to_dict(obj) # self.file.write(yaml_dumper.dumps(obj)) elif isinstance(obj, dict): diff --git a/src/oaklib/query.py b/src/oaklib/query.py index f6df52bc2..a3303ca97 100644 --- a/src/oaklib/query.py +++ b/src/oaklib/query.py @@ -17,12 +17,25 @@ from oaklib.datamodels.search import create_search_configuration from oaklib.datamodels.vocabulary import ( DEVELOPS_FROM, + DISJOINT_WITH, + ENABLED_BY, + ENABLES, EQUIVALENT_CLASS, + HAS_DIRECT_INPUT, + HAS_INPUT, + HAS_OUTPUT, + HAS_PART, IS_A, + NEGATIVELY_REGULATES, + OCCURS_IN, OWL_CLASS, OWL_OBJECT_PROPERTY, PART_OF, + POSITIVELY_REGULATES, RDF_TYPE, + RDFS_DOMAIN, + RDFS_RANGE, + REGULATES, ) from oaklib.interfaces import ( OboGraphInterface, @@ -30,6 +43,7 @@ SearchInterface, SubsetterInterface, ) +from oaklib.interfaces.obograph_interface import GraphTraversalMethod from oaklib.interfaces.semsim_interface import SemanticSimilarityInterface from oaklib.types import CURIE, PRED_CURIE from oaklib.utilities.subsets.slimmer_utils import filter_redundant @@ -488,6 +502,7 @@ def chain_results(v): # queries can be nested using square brackets query_terms = nest_list_of_terms(query_terms) + logging.debug(f"Query terms: {query_terms}") while len(query_terms) > 0: # process each query term. 
A query term is either: @@ -518,16 +533,23 @@ def chain_results(v): lines = [line.strip() for line in file.readlines()] query_terms = lines + query_terms elif re.match(r"^([\w\-\.]+):(\S+)$", term): + logging.debug(f"CURIE: {term}") + if term.endswith(","): + logging.info(f"Removing trailing comma from {term}") + term = term[:-1] # CURIE chain_results(term) elif re.match(r"^http(\S+)$", term): + logging.debug(f"URI: {term}") # URI chain_results(term) elif re.match(r"^\.predicates=(\S*)$", term): + logging.debug(f"Predicates: {term}") logging.warning("Deprecated: pass as parameter instead") m = re.match(r"^\.predicates=(\S*)$", term) predicates = _process_predicates_arg(m.group(1)) elif term == ".and": + logging.debug("AND") # boolean term: consume the result of the query and intersect rest = list(query_terms_iterator(query_terms, adapter)) for x in results: @@ -535,6 +557,7 @@ def chain_results(v): yield x query_terms = [] elif term == ".xor": + logging.debug("XOR") # boolean term: consume the result of the query and xor rest = list(query_terms_iterator(query_terms, adapter)) remaining = [] @@ -548,6 +571,7 @@ def chain_results(v): yield x query_terms = [] elif term == ".not" or term == ".minus": + logging.debug("Minus") # boolean term: consume the result of the query and subtract rest = list(query_terms_iterator(query_terms, adapter)) for x in results: @@ -555,15 +579,20 @@ def chain_results(v): yield x query_terms = [] elif term == ".or": + logging.debug("OR") # or is implicit pass elif term.startswith(".all"): + logging.debug("All") chain_results(adapter.entities(filter_obsoletes=False)) elif term.startswith(".classes"): + logging.debug("Classes") chain_results(adapter.entities(owl_type=OWL_CLASS)) elif term.startswith(".relations"): + logging.debug("Relations") chain_results(adapter.entities(owl_type=OWL_OBJECT_PROPERTY)) elif term.startswith(".rand"): + logging.debug(f"Random: {term}") params = _parse_params(term) sample_size = params.get("n", "100") entities 
= list(adapter.entities()) @@ -572,22 +601,28 @@ def chain_results(v): ] chain_results(sample) elif term.startswith(".in"): + logging.debug(f"IN: {term}") # subset query subset = query_terms[0] query_terms = query_terms[1:] chain_results(adapter.subset_members(subset)) elif term.startswith(".is_obsolete"): + logging.debug("Obsolete") chain_results(adapter.obsoletes()) elif term.startswith(".non_obsolete"): + logging.debug("Non-obsolete") chain_results(adapter.entities(filter_obsoletes=True)) elif term.startswith(".dangling"): + logging.debug("Dangling") chain_results(adapter.dangling()) elif term.startswith(".filter"): + logging.debug(f"Filter: {term}") # arbitrary python expression expr = query_terms[0] query_terms = query_terms[1:] chain_results(eval(expr, {"impl": adapter, "terms": results})) # noqa elif term.startswith(".query"): + logging.debug(f"Query: {term}") # arbitrary SPARQL or SQL query (implementation specific) params = _parse_params(term) prefixes = params.get("prefixes", None) @@ -595,6 +630,7 @@ def chain_results(v): query_terms = query_terms[1:] chain_results([list(v.values())[0] for v in adapter.query(query, prefixes=prefixes)]) elif term.startswith(".desc"): + logging.debug(f"Descendants: {term}") # graph query: descendants params = _parse_params(term) this_predicates = params.get("predicates", predicates) @@ -605,6 +641,7 @@ def chain_results(v): else: raise NotImplementedError elif term.startswith(".sub"): + logging.debug(f"Subclasses: {term}") # graph query: is-a descendants rest = list(query_terms_iterator([query_terms[0]], adapter)) query_terms = query_terms[1:] @@ -613,6 +650,7 @@ def chain_results(v): else: raise NotImplementedError elif term.startswith(".child"): + logging.debug(f"Children: {term}") # graph query: children params = _parse_params(term) this_predicates = params.get("predicates", predicates) @@ -623,6 +661,7 @@ def chain_results(v): ] chain_results(children) elif term.startswith(".parent"): + logging.debug(f"Parents: 
{term}") # graph query: parents params = _parse_params(term) this_predicates = params.get("predicates", predicates) @@ -633,6 +672,7 @@ def chain_results(v): ] chain_results(parents) elif term.startswith(".sib"): + logging.debug(f"Siblings: {term}") # graph query: siblings params = _parse_params(term) this_predicates = params.get("predicates", predicates) @@ -647,16 +687,23 @@ def chain_results(v): ] chain_results(sibs) elif term.startswith(".anc"): + logging.debug(f"Anc: {term}") # graph query: ancestors params = _parse_params(term) this_predicates = params.get("predicates", predicates) + this_method = params.get("method", None) + if this_method is not None: + this_method = GraphTraversalMethod(this_method) rest = list(query_terms_iterator([query_terms[0]], adapter)) query_terms = query_terms[1:] if isinstance(adapter, OboGraphInterface): - chain_results(adapter.ancestors(rest, predicates=this_predicates)) + chain_results( + adapter.ancestors(rest, predicates=this_predicates, method=this_method) + ) else: raise NotImplementedError elif term.startswith(".mrca"): + logging.debug(f"MRCA: {term}") # graph query: most recent common ancestors params = _parse_params(term) this_predicates = params.get("predicates", predicates) @@ -669,6 +716,7 @@ def chain_results(v): else: raise NotImplementedError elif term.startswith(".nr"): + logging.debug(f"Non-redundant: {term}") # graph query: non-redundant params = _parse_params(term) this_predicates = params.get("predicates", predicates) @@ -676,6 +724,7 @@ def chain_results(v): query_terms = query_terms[1:] chain_results(filter_redundant(adapter, rest, this_predicates)) elif term.startswith(".gap_fill"): + logging.debug(f"Gap fill: {term}") if not isinstance(adapter, SubsetterInterface): raise NotImplementedError params = _parse_params(term) @@ -684,9 +733,13 @@ def chain_results(v): query_terms = query_terms[1:] chain_results(adapter.gap_fill_relationships(rest, predicates=this_predicates)) else: + logging.debug(f"Atomic term: 
{term}")
             # term is not query syntax: feed directly to search
             if not isinstance(adapter, SearchInterface):
                 raise NotImplementedError(f"Search not implemented for {type(adapter)}")
+            if term.endswith(","):
+                logging.info(f"Removing trailing comma from {term}")
+                term = term[:-1]
             cfg = create_search_configuration(term)
             logging.info(f"Search config: {term} => {cfg}")
             chain_results(adapter.basic_search(cfg.search_terms[0], config=cfg))
@@ -773,7 +826,13 @@ def _process_predicates_arg(
             inputs = predicates_str.split(",")
         else:
             inputs = predicates_str.split("+")
-        preds = [_shorthand_to_pred_curie(p) for p in inputs]
+        preds = []
+        for p in inputs:
+            next_preds = _shorthand_to_pred_curie(p)
+            if isinstance(next_preds, list):
+                preds.extend(next_preds)
+            else:
+                preds.append(next_preds)
     if exclude_predicates_str:
         if "," in exclude_predicates_str:
             exclude_inputs = exclude_predicates_str.split(",")
@@ -791,16 +850,30 @@ def _process_predicates_arg(
     return preds
 
 
-def _shorthand_to_pred_curie(shorthand: str) -> PRED_CURIE:
-    if shorthand == "i":
-        return IS_A
-    elif shorthand == "p":
-        return PART_OF
-    elif shorthand == "d":
-        return DEVELOPS_FROM
-    elif shorthand == "t":
-        return RDF_TYPE
-    elif shorthand == "e":
-        return EQUIVALENT_CLASS
-    else:
-        return shorthand
+# Shorthand codes accepted in predicate arguments; a code may expand to several predicates.
+_PREDICATE_SHORTHANDS = {
+    "i": IS_A,
+    "p": PART_OF,
+    "h": HAS_PART,
+    "o": OCCURS_IN,
+    "d": DEVELOPS_FROM,
+    "en": [ENABLES, ENABLED_BY],
+    "io": [HAS_INPUT, HAS_OUTPUT, HAS_DIRECT_INPUT],
+    "r": [REGULATES, NEGATIVELY_REGULATES, POSITIVELY_REGULATES],
+    "t": RDF_TYPE,
+    "e": EQUIVALENT_CLASS,
+    "owl": [IS_A, RDF_TYPE, EQUIVALENT_CLASS, DISJOINT_WITH, RDFS_DOMAIN, RDFS_RANGE],
+}
+
+
+def _shorthand_to_pred_curie(shorthand: str) -> Union[PRED_CURIE, List[PRED_CURIE]]:
+    """
+    Expand a predicate shorthand code.
+
+    :param shorthand: a shorthand code such as "i" or "p", or any other string
+    :return: the predicate for the code, a list of predicates if the code stands for
+        several, or the input unchanged if it is not a known code
+    """
+    expansion = _PREDICATE_SHORTHANDS.get(shorthand, shorthand)
+    # copy list expansions so callers cannot mutate the shared table
+    return list(expansion) if isinstance(expansion, list) else expansion
diff --git a/src/oaklib/utilities/associations/association_index.py b/src/oaklib/utilities/associations/association_index.py
index ef89e2386..31959aa84 100644
--- 
a/src/oaklib/utilities/associations/association_index.py
+++ b/src/oaklib/utilities/associations/association_index.py
@@ -81,10 +81,14 @@ def lookup(
     ) -> Iterator[Association]:
         session = self._session
         q = session.query(TermAssociation)
+        # at or above this many subjects, skip the SQL IN clause and filter rows in python
+        union_cutoff = 200
         if property_filter:
             raise NotImplementedError
         if subjects:
-            q = q.filter(TermAssociation.subject.in_(tuple(subjects)))
+            subjects = list(subjects)
+            if len(subjects) < union_cutoff:
+                q = q.filter(TermAssociation.subject.in_(tuple(subjects)))
         if predicates:
             q = q.filter(TermAssociation.predicate.in_(tuple(predicates)))
         if objects:
@@ -92,4 +96,8 @@ def lookup(
         logging.info(f"Association index lookup: {q}")
         for row in q:
             tup = (row.subject, row.predicate, row.object)
+            # large subject lists were not filtered in SQL (see above), so filter them here
+            if subjects and len(subjects) >= union_cutoff:
+                if row.subject not in subjects:
+                    continue
             yield from self._associations_by_spo[tup]
diff --git a/tests/test_implementations/test_ncbigene.py b/tests/test_implementations/test_ncbigene.py
index 345404887..feb97c8e6 100644
--- a/tests/test_implementations/test_ncbigene.py
+++ b/tests/test_implementations/test_ncbigene.py
@@ -41,7 +41,7 @@ def test_parse_gene_xml(self):
         root = ElementTree.parse(str(GENE_PATH)).getroot()  # noqa S314
         if not isinstance(adapter, NCBIGeneImplementation):
             raise AssertionError
-        assocs = list(adapter.associations_from_xml("NCBIGene:1956", root))
+        assocs = list(adapter._go_associations_from_xml("NCBIGene:1956", root))
         self.assertGreater(len(assocs), 0)
         self.assertEqual(assocs[0].subject, "NCBIGene:1956")
         found = 0