Source code for eutils._internal.queryservice

# -*- coding: utf-8 -*-

"""provide cached and throttled querying of `NCBI E-utilities
<http://www.ncbi.nlm.nih.gov/books/NBK25499/>`_.

QueryService defaults to returning XML documents only. This behavior
may be controlled upon instantiation by setting default_args.

::

    # create an instance of QueryService
    >> qs = QueryService()

    # get xml for database info (in this case, a list of available database)
    >> result = qs.einfo()

    # execute a search using an NCBI query against the gene database
    >> result = qs.esearch({'db': 'gene', 'term': 'VEGF AND human[organism]'})

    # get xml doc for gene id=7157
    >> result = qs.efetch({'db': 'gene', 'id': 7157})

"""

from __future__ import absolute_import, division, print_function, unicode_literals

import hashlib
import logging
import os
import time

import lxml.etree
import requests

from .sqlitecache import SQLiteCache
from .exceptions import EutilsRequestError, EutilsNCBIError
from .compat import pickle


_logger = logging.getLogger(__name__)

url_base = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils'
default_default_args = {'retmode': 'xml', 'usehistory': 'y', 'retmax': 250}
default_tool = __package__
default_email = 'biocommons-dev@googlegroups.com'
default_cache_path = os.path.join(os.path.expanduser('~'), '.cache', 'eutils-cache.db')


[docs]class QueryService(object):

    """*provides throttled and cached querying of NCBI E-utilities services*

    QueryService has three functions:

    * construct URLs appropriate for eutils endpoints

    * throttle queries per NCBI guidelines

    * cache results in persistent cache (sqlite)

    QueryService works with any valid query arguments, passed as
    dictionaries.

    Implemented interfaces:

        * esearch
        * efetch
        * elink
        * einfo
        * esummary

    Implementing other query modes should be straightforward.

    See also the NCBI's Entrez Programming Utilities Help: 
        http://www.ncbi.nlm.nih.gov/books/NBK25500/

    """

    def __init__(self,
                 email=default_email,
                 cache=False,
                 default_args=default_default_args,
                 request_interval=None,
                 tool=default_tool,
                 api_key=None
                 ):
        """
        :param str email: email of user (for abuse reports)
        :param str cache: if True, cache at ~/.cache/eutils-db.sqlite; if string, cache there; if False, don't cache
        :param dict default_args: dictionary of query args that should accompany all requests
        :param request_interval: seconds between requests; default: auto-select based on API key
        :type request_interval: int or a callable returning an int
        :param str api_key: api key assigned by NCBI
        :param str tool: name of client
        :rtype: None
        :raises OSError: if sqlite file can't be opened

        """

        self.default_args = default_args
        self.email = email
        self.tool = tool
        self.api_key = api_key

        if request_interval is not None:
            _logger.warning("eutils QueryService: request_interval no longer supported; ignoring passed parameter")

        if self.api_key is None:
            requests_per_second = 3
            _logger.warning("No NCBI API key provided; throttling to {} requests/second; see "
                         "https://ncbiinsights.ncbi.nlm.nih.gov/2017/11/02/new-api-keys-for-the-e-utilities/".format(
                             requests_per_second))
        else:
            requests_per_second = 10
            _logger.info("Using NCBI API key; throttling to {} requests/second".format(requests_per_second))

        self.request_interval = 1.0 / requests_per_second

        self._last_request_clock = 0
        self._ident_args = {'tool': tool, 'email': email}
        self._request_count = 0

        if cache is True:
            cache_path = default_cache_path
        elif cache:
            cache_path = cache  # better act like a path string
        else:
            cache_path = False
        self._cache = SQLiteCache(cache_path) if cache_path else None


[docs]    def efetch(self, args):
        """
        execute a cached, throttled efetch query

        :param dict args: dict of query items
        :returns: content of reply
        :rtype: str
        :raises EutilsRequestError: when NCBI replies, but the request failed (e.g., bogus database name)

        """
        return self._query('/efetch.fcgi', args)

[docs]    def einfo(self, args=None):
        """
        execute a NON-cached, throttled einfo query

        einfo.fcgi?db=<database>

        Input: Entrez database (&db) or None (returns info on all Entrez databases)

        Output: XML containing database statistics

        Example: Find database statistics for Entrez Protein.

            QueryService.einfo({'db': 'protein'})

        Equivalent HTTP request:

            https://eutils.ncbi.nlm.nih.gov/entrez/eutils/einfo.fcgi?db=protein

        :param dict args: dict of query items (optional)
        :returns: content of reply
        :rtype: str
        :raises EutilsRequestError: when NCBI replies, but the request failed (e.g., bogus database name)

        """
        if args is None:
            args = {}
        return self._query('/einfo.fcgi', args, skip_cache=True)

[docs]    def esearch(self, args):
        """
        execute a cached, throttled esearch query

        :param dict args: dict of query items, containing at least 'db' and 'term' keys
        :returns: content of reply
        :rtype: str
        :raises EutilsRequestError: when NCBI replies, but the request failed (e.g., bogus database name)

        """
        return self._query('/esearch.fcgi', args)

[docs]    def elink(self, args):
        """
        execute a cached, throttled elink query

        Input: List of UIDs (&id); Source Entrez database (&dbfrom); Destination Entrez database (&db)

        Output: XML containing linked UIDs from source and destination databases

        Example: Find one set of Gene IDs linked to nuccore GIs 34577062 and 24475906

            QueryService.elink({'dbfrom': 'nuccore', 'db': 'gene', 'id': '34577062,24475906'})

        Equivalent HTTP request:

            https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi?dbfrom=nuccore&db=gene&id=34577062,24475906

        :param dict args: dict of query items containing at least the 'db', 'dbfrom', and 'id' keys.
        :returns: content of reply
        :rtype: str
        :raises EutilsRequestError: when NCBI replies, but the request failed (e.g., bogus database name)

        """
        return self._query('/elink.fcgi', args)

[docs]    def esummary(self, args):
        """
        execute a cached, throttled esummary query

        Input: List of UIDs (&id); Entrez database (&db)

        Output: XML document summary for requested ID(s) [comma-separated]

        Example: 
        
            QueryService.esummary({ 'db': 'medgen', 'id': 134 })

        Equivalent HTTP request:

            https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi?db=medgen&id=134

        :param dict args: dict of query items containing at least 'db' and 'id' keys.
        :returns: content of reply
        :rtype: str
        :raises EutilsRequestError: when NCBI replies, but the request failed (e.g., bogus database name)

        """
        return self._query('/esummary.fcgi', args)


    ############################################################################
    ## Internals
    def _query(self, path, args=None, skip_cache=False, skip_sleep=False):
        """return results for a NCBI query, possibly from the cache

        :param: path: relative query path (e.g., 'einfo.fcgi')
        :param: args: dictionary of query args
        :param: skip_cache: whether to bypass the cache on reading
        :param: skip_sleep: whether to bypass query throttling
        :rtype: xml string

        The args are joined with args required by NCBI (tool and email
        address) and with the default args declared when instantiating
        the client.
        """
        if args is None:
            args = {}        
        def _cacheable(r):
            """return False if r shouldn't be cached (contains a no-cache meta
            line); True otherwise"""
            return not ("no-cache" in r  # obviate parsing, maybe
                        and lxml.etree.XML(r).xpath("//meta/@content='no-cache'"))
        
        # cache key: the key associated with this endpoint and args The
        # key intentionally excludes the identifying args (tool and email)
        # and is independent of the request method (GET/POST) args are
        # sorted for canonicalization

        url = url_base + path

        # next 3 lines converted by 2to3 -nm
        defining_args = dict(list(self.default_args.items()) + list(args.items()))
        full_args = dict(list(self._ident_args.items()) + list(defining_args.items()))
        cache_key = hashlib.md5(pickle.dumps((url, sorted(defining_args.items())))).hexdigest()

        sqas = ';'.join([k + '=' + str(v) for k, v in sorted(args.items())])
        full_args_str = ';'.join([k + '=' + str(v) for k, v in sorted(full_args.items())])

        logging.debug("CACHE:" + str(skip_cache) + "//" + str(self._cache))
        if not skip_cache and self._cache:
            try:
                v = self._cache[cache_key]
                _logger.debug('cache hit for key {cache_key} ({url}, {sqas}) '.format(
                    cache_key=cache_key,
                    url=url,
                    sqas=sqas))
                return v
            except KeyError:
                _logger.debug('cache miss for key {cache_key} ({url}, {sqas}) '.format(
                    cache_key=cache_key,
                    url=url,
                    sqas=sqas))
                pass

        if self.api_key:
            url += '?api_key={self.api_key}'.format(self=self)

        # --

        if not skip_sleep:
            req_int = self.request_interval
            sleep_time = req_int - (time.clock() - self._last_request_clock)
            if sleep_time > 0:
                _logger.debug('sleeping {sleep_time:.3f}'.format(sleep_time=sleep_time))
                time.sleep(sleep_time)

        r = requests.post(url, full_args)
        self._last_request_clock = time.clock()
        _logger.debug('post({url}, {fas}): {r.status_code} {r.reason}, {len})'.format(
            url=url,
            fas=full_args_str,
            r=r,
            len=len(r.text)))

        if not r.ok:
            # TODO: discriminate between types of errors
            if r.headers["Content-Type"] == "application/json":
                json = r.json()
                raise EutilsRequestError('{r.reason} ({r.status_code}): {error}'.format(r=r, error=json["error"]))
            try:
                xml = lxml.etree.fromstring(r.text.encode('utf-8'))
                raise EutilsRequestError('{r.reason} ({r.status_code}): {error}'.format(r=r, error=xml.find('ERROR').text))
            except Exception as ex:
                raise EutilsNCBIError('Error parsing response object from NCBI: {}'.format(ex))

        if any(bad_word in r.text for bad_word in ['<error>', '<ERROR>']):
            if r.text is not None:
                try:
                    xml = lxml.etree.fromstring(r.text.encode('utf-8'))
                    raise EutilsRequestError('{r.reason} ({r.status_code}): {error}'.format(r=r, error=xml.find('ERROR').text))
                except Exception as ex:
                    raise EutilsNCBIError('Error parsing response object from NCBI: {}'.format(ex))

        if '<h1 class="error">Access Denied</h1>' in r.text:
            raise EutilsRequestError('Access Denied: {url}'.format(url=url))

        if self._cache and _cacheable(r.text):
            # N.B. we cache results even when skip_cache (read) is true
            self._cache[cache_key] = r.content
            _logger.info('cached results for key {cache_key} ({url}, {sqas}) '.format(
                cache_key=cache_key,
                url=url,
                sqas=sqas))

        return r.content


if __name__ == '__main__':
    logging.basicConfig(level=logging.DEBUG)
    qs = QueryService()
    r = qs.einfo({'db': 'protein'})
    r = qs.efetch({'db': 'protein', 'id': '319655736'})


# <LICENSE>
# Copyright 2015 eutils Committers
# 
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
# 
# http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
# or implied. See the License for the specific language governing
# permissions and limitations under the License.
# </LICENSE>