Source code for pybliometrics.scopus.abstract_citation

from datetime import datetime
from hashlib import md5
from typing import NamedTuple
from warnings import warn

from pybliometrics.superclasses import Retrieval
from pybliometrics.utils import chained_get, check_parameter_value


class Author(NamedTuple):
    name: str | None
    surname: str | None
    initials: str | None
    id: str
    url: str | None



[docs]
class CitationOverview(Retrieval):
    @property
    def authors(self) -> list[list[Author] | None] | None:
        """A list of lists of namedtuples storing author information,
        where each namedtuple corresponds to one author and each sub-list to
        one document.
        The information in each namedtuple is `(name surname initials id url)`.
        All entries are strings.
        """
        outer = []
        for doc in self._citeInfoMatrix:
            inner = []
            for author in doc.get('author', []):
                author = {k.split(":", 1)[-1]: v for k, v in author.items()}
                new = Author(name=author.get('index-name'),
                             id=author['authid'],
                             surname=author.get('surname'),
                             initials=author.get('initials'),
                             url=author.get('author-url'))
                inner.append(new)
            outer.append(inner or None)
        return _maybe_return_list(outer)

    @property
    def cc(self) -> list[list[tuple[int, int]]]:
        """List of lists of tuples of yearly number of citations for specified
        years, where each sub-list corresponds to one document.
        """
        try:
            dates = self._date.split("-")
        except AttributeError:
            current_year = datetime.now().year
            dates = (current_year-2, current_year)
        _years = range(int(dates[0]), int(dates[1])+1)
        outer = []
        for doc in self._citeInfoMatrix:
            try:
                cites = [int(d['$']) for d in doc['cc']]
            except (AttributeError, KeyError, TypeError):  # No citations
                cites = [0]*len(_years)
            outer.append(list(zip(_years, cites)))
        return _maybe_return_list(outer)

    @property
    def citationType_long(self) -> list[str | None] | None:
        """Type (long version) of the documents (e.g. article, review)."""
        path = ["citationType", "$"]
        out = [chained_get(e, path) for e in self._citeInfoMatrix]
        return _maybe_return_list(out)

    @property
    def citationType_short(self) -> list[str | None] | None:
        """Type (short version) of the documents (e.g. ar, re)."""
        path = ["citationType", "@code"]
        out = [chained_get(e, path) for e in self._citeInfoMatrix]
        return _maybe_return_list(out)

    @property
    def columnTotal(self) -> list[int]:
        """The yearly number of citations for all documents combined."""
        return [int(d["$"]) for d in self._citeCountHeader["columnTotal"]]

    @property
    def doi(self) -> list[str | None] | None:
        """Document Object Identifier (DOI) of the documents."""
        out = [e.get('doi') for e in self._identifierlegend]
        return _maybe_return_list(out)

    @property
    def endingPage(self) -> list[str | None] | None:
        """Ending pages of the documents."""
        out = [e.get('endingPage') for e in self._citeInfoMatrix]
        return _maybe_return_list(out)

    @property
    def grandTotal(self) -> int:
        """The total number of citations of all documents together."""
        return int(self._citeCountHeader["grandTotal"])

    @property
    def h_index(self) -> int:
        """Combined h-index of citations of all the documents."""
        return int(self._data['h-index'])

    @property
    def issn(self) -> list[str | tuple[str, str] | None] | None:
        """ISSN of the publishers of the documents.
        Note: If E-ISSN is known to Scopus, this returns both
        ISSN and E-ISSN in random order separated by blank space.
        """
        out = [e.get('issn') for e in self._citeInfoMatrix]
        return _maybe_return_list(out)

    @property
    def issueIdentifier(self) -> list[str | None] | None:
        """Issue numbers of the documents."""
        out = [e.get('issueIdentifier') for e in self._citeInfoMatrix]
        return _maybe_return_list(out)

    @property
    def laterColumnTotal(self) -> int:
        """The total number of citations for all years after the end
        year for all documents combined.
        """
        return int(self._citeCountHeader["laterColumnTotal"])

    @property
    def lcc(self) -> list[int]:
        """Number of citations after the end year of each document."""
        return [int(m['lcc']) for m in self._citeInfoMatrix]

    @property
    def pcc(self) -> list[int]:
        """Number of citations before the start year."""
        return [int(m['pcc']) for m in self._citeInfoMatrix]

    @property
    def pii(self) -> list[str | None] | None:
        """The Publication Item Identifier (PII) of the documents."""
        out = [e.get('pii') for e in self._identifierlegend]
        return _maybe_return_list(out)

    @property
    def prevColumnTotal(self) -> int:
        """The total number of citations for all years before the start
        year for all documents combined.
        """
        return int(self._citeCountHeader["prevColumnTotal"])

    @property
    def publicationName(self) -> list[str | None] | None:
        """Name of source the documents are published in (e.g. the Journal)."""
        out = [e.get('publicationName') for e in self._citeInfoMatrix]
        return _maybe_return_list(out)

    @property
    def rangeColumnTotal(self) -> int:
        """The total number of citations for all specified years for all
        documents combined.
        """
        return int(self._citeCountHeader["rangeColumnTotal"])

    @property
    def rangeCount(self) -> list[int]:
        """Total citation count over the specified year range for
        each document.
        """
        return [int(e['rangeCount']) for e in self._citeInfoMatrix]

    @property
    def rowTotal(self) -> list[int]:
        """Total number of citations (specified and omitted years) for each
        document.
        """
        return [int(e['rowTotal']) for e in self._citeInfoMatrix]

    @property
    def scopus_id(self) -> list[int]:
        """The Scopus ID(s) of the documents.  Might differ from the
        ones provided.
        """
        return [int(e['scopus_id']) for e in self._identifierlegend]

    @property
    def startingPage(self) -> list[str | None] | None:
        """Starting page."""
        out = [e.get('startingPage') for e in self._citeInfoMatrix]
        return _maybe_return_list(out)

    @property
    def title(self) -> list[str]:
        """Titles of each document."""
        return [e["title"] for e in self._citeInfoMatrix]

    @property
    def url(self) -> list[str]:
        """URL(s) to Citation Overview API view of each document."""
        return [e["url"] for e in self._citeInfoMatrix]

    @property
    def volume(self) -> list[str | None] | None:
        """Volume for the abstract."""
        out = [e.get('volume') for e in self._citeInfoMatrix]
        return _maybe_return_list(out)

    def __init__(self,
                 identifier: list[int | str],
                 date: str | None = None,
                 start: int | str | None = None,
                 end: int | str | None = None,
                 id_type: str = "scopus_id",
                 refresh: bool | int = False,
                 citation: str | None = None,
                 **kwds: str
                 ) -> None:
        """Interaction with the Citation Overview API.

        :param identifier: Up to 25 identifiers of the same kind for which to look
                           up citations.  Must be Scopus IDs, DOIs, PIIs or
                           Pubmed IDs.
        :param date: Represents the year range for which the citations should be counted.
                     If `None`, Scopus returns data for the current and the previous
                     two years.
        :param start: (deprecated) The first year for which the citation count should
                      be loaded.
        :param end: (deprecated) The last year for which the citation count should be
                    loaded. Defaults to the current year.
        :param id_type: The type of the IDs provided in `identifier`.  Must be
                        one of `"scopus_id", "doi", "pii", "pubmed_id"`.
        :param refresh: Whether to refresh the cached file if it exists or not.
                        If int is passed, cached file will be refreshed if the
                        number of days since last modification exceeds that value.
        :param citation: Allows for the exclusion of self-citations or those
                         by books.  If `None`, will count all citations.
                         Allowed values: `None, exclude-self, exclude-books`
        :param kwds: Keywords passed on as query parameters.  Must contain
                     fields and values mentioned in the API specification at
                     https://dev.elsevier.com/documentation/AbstractCitationAPI.wadl.

        Raises
        -----
        ValueError
            If parameter `identifier` contains fewer than 1 or more than
            25 elements.

        ValueError
            If any of the parameters `citation`, `id_type` or `refresh` is not
            one of the allowed values.

        Notes
        -----
        The directory for cached results is `{path}/STANDARD/{id}-{citation}-{date}`,
        where `path` is specified in your configuration file, and `id` the
        md5-hashed version of a string joining `identifier` on underscore.

        Your API Key needs to be augmented by Elsevier's Scopus
        Integration Team to access this API.
        """
        # Checks
        allowed = ('scopus_id', 'doi', 'pii', 'pubmed_id')
        check_parameter_value(id_type, allowed, "id_type")
        if citation:
            allowed = ('exclude-self', 'exclude-books')
            check_parameter_value(citation, allowed, "citation")
        if len(identifier) < 0 or len(identifier) > 25:
            msg = "Provide at least 1 and at most than 25 identifiers"
            raise ValueError(msg)

        # Variables
        identifier = [str(i) for i in identifier]
        if start or end:
            msg = "Parameters `start` and `end` are deprecated and will be removed"\
                  f" in a future release.  Please use 'date={start}-{end}' instead."
            warn(msg, FutureWarning)
            if not date:
                date = f'{start}-{end}'
        self._date = date
        self._citation = citation
        self._refresh = refresh
        self._view = "STANDARD"

        # Get file content
        kwds.update({id_type: identifier})
        stem = md5("_".join(identifier).encode('utf8')).hexdigest()
        Retrieval.__init__(self, stem, date=date, citation=citation, **kwds)
        self._data = self._json['abstract-citations-response']

        # citeInfoMatrix
        matrix = self._data['citeInfoMatrix']['citeInfoMatrixXML']['citationMatrix']['citeInfo']
        self._citeInfoMatrix = [_parse_dict(e) for e in matrix]
        # identifier-legend
        identifier = self._data['identifier-legend']['identifier']
        self._identifierlegend = [_parse_dict(e) for e in identifier]
        # citeCountHeader
        self._citeCountHeader = self._data['citeColumnTotalXML']["citeCountHeader"]

    def __str__(self):
        """Return a summary string."""
        cits_dict = {'exclude-self': 'excluding self-citations',
                     'exclude-books': 'excluding citations from books'}
        date = self.get_cache_file_mdate().split()[0]
        cits_type = f'{cits_dict.get(self._citation, "")}'
        s = f"{len(self.scopus_id)} document(s) has/have the following "\
            f"total citation count{cits_type} as of {date}:\n    "\
            f"{'; '.join([str(n) for n in self.rowTotal])}"
        return s



def _parse_dict(dct):
    """Auxiliary function to change the keys of a dictionary."""
    return {k.split(":", 1)[-1]: v for k, v in dct.items()}


def _maybe_return_list(lst):
    """Return `lst` unless all of its elements are empty."""
    if all(e is None for e in lst):
        return None
    else:
        return lst