# Source code for pybliometrics.scopus.scopus_search

from typing import NamedTuple

from pybliometrics.superclasses import Search
from pybliometrics.utils import check_integrity, check_parameter_value, \
    check_field_consistency, deduplicate, get_freetoread, html_unescape, \
    listify, make_search_summary, VIEWS


class Document(NamedTuple):
    eid: str | None
    doi: str | None
    pii: str | None
    pubmed_id: str | None
    title: str | None
    subtype: str | None
    subtypeDescription: str | None
    creator: str | None
    afid: str | None
    affilname: str | None
    affiliation_city: str | None
    affiliation_country: str | None
    author_count: str | None
    author_names: str | None
    author_ids: str | None
    author_afids: str | None
    coverDate: str | None
    coverDisplayDate: str | None
    publicationName: str | None
    issn: str | None
    source_id: str | None
    eIssn: str | None
    aggregationType: str | None
    volume: str | None
    issueIdentifier: str | None
    article_number: str | None
    pageRange: str | None
    description: str | None
    authkeywords: str | None
    citedby_count: int
    openaccess: int
    freetoread: str | None
    freetoreadLabel: str | None
    fund_acr: str | None
    fund_no: str | None
    fund_sponsor: str | None



# [docs]
class ScopusSearch(Search):
    """Interaction with the Scopus Search API."""

    @property
    def results(self) -> list[Document] | None:
        """A list of namedtuples in the form `(eid doi pii pubmed_id title
        subtype subtypeDescription creator afid affilname affiliation_city
        affiliation_country author_count author_names author_ids author_afids
        coverDate coverDisplayDate publicationName issn source_id eIssn
        aggregationType volume issueIdentifier article_number pageRange
        description authkeywords citedby_count openaccess freetoread
        freetoreadLabel fund_acr fund_no fund_sponsor)`.
        Field definitions correspond to
        https://dev.elsevier.com/guides/ScopusSearchViews.htm and return the
        values as-is, except for `afid`, `affilname`, `affiliation_city`,
        `affiliation_country`, `author_names`, `author_ids` and `author_afids`:
        This information is joined on `";"`.  In case an author has multiple
        affiliations, they are joined on `"-"`
        (e.g. `Author1Aff;Author2Aff1-Author2Aff2`).

        Returns `None` when there are no results.

        Raises
        ------
        ValueError
            If the elements provided in `integrity_fields` do not match the
            actual field names (listed above).

        Notes
        -----
        The list of authors and the list of affiliations per author are
        deduplicated.

        The Scopus API returns only the first funding information.
        """
        # Space-separated field names, taken from the namedtuple itself so
        # the integrity check cannot drift from the actual record layout
        fields = ' '.join(Document._fields)
        check_field_consistency(self._integrity, fields)
        # Parse elements one-by-one
        out = []
        for item in self._json:
            info = {}
            # Parse affiliations
            for field, key in [('affilname', 'affilname'),
                               ('afid', 'afid'),
                               ('aff_city', 'affiliation-city'),
                               ('aff_country', 'affiliation-country')]:
                info[field] = _join(item, key, unescape=self.unescape)
            # Parse authors
            try:
                # Deduplicate list of authors
                authors = deduplicate(item['author'])
                # Extract information
                surnames = _replace_none([d['surname'] for d in authors])
                firstnames = _replace_none([d['given-name'] for d in authors])
                info["auth_names"] = ";".join(
                    ", ".join(pair) for pair in zip(surnames, firstnames))
                info["auth_ids"] = ";".join([d['authid'] for d in authors])
                affs = []
                for auth in authors:
                    aff = listify(deduplicate(auth.get('afid', [])))
                    affs.append('-'.join([d['$'] for d in aff]))
                # Only report affiliation IDs if at least one author has any
                info["auth_afid"] = ';'.join(affs) if any(affs) else None
            except KeyError:
                # Best effort: entries without (complete) author information
                # keep whatever was parsed so far; missing keys in `info`
                # turn into None via `info.get()` below
                pass
            # The cover date is either a plain value or a list of dicts
            date = item.get('prism:coverDate')
            if isinstance(date, list):
                date = date[0].get('$')
            freetoread = get_freetoread(item, ["freetoread", "value"])
            freetoreadLabel = get_freetoread(item, ["freetoreadLabel", "value"])
            # Get text fields and unescape
            for key in ['dc:title', 'dc:description', 'authkeywords']:
                value = item.get(key)
                if self.unescape and value:
                    value = html_unescape(str(value))
                info[key] = value
            # `or ''` / `or {}` also cover keys that are present but None,
            # which the plain `.get(key, default)` form would not
            fund_no = (item.get('fund-no') or '').replace("undefined", "") or None
            author_count = (item.get('author-count') or {}).get('$')
            new = Document(
                eid=item.get('eid'),
                doi=item.get('prism:doi'),
                pii=item.get('pii'),
                pubmed_id=item.get('pubmed-id'),
                title=info.get('dc:title'),
                subtype=item.get('subtype'),
                subtypeDescription=item.get('subtypeDescription'),
                creator=item.get('dc:creator'),
                afid=info.get("afid"),
                affilname=info.get("affilname"),
                affiliation_city=info.get("aff_city"),
                affiliation_country=info.get("aff_country"),
                author_count=author_count,
                author_names=info.get("auth_names"),
                author_ids=info.get("auth_ids"),
                author_afids=info.get("auth_afid"),
                coverDate=date,
                coverDisplayDate=item.get('prism:coverDisplayDate'),
                publicationName=item.get('prism:publicationName'),
                issn=item.get('prism:issn'),
                source_id=item.get('source-id'),
                eIssn=item.get('prism:eIssn'),
                aggregationType=item.get('prism:aggregationType'),
                volume=item.get('prism:volume'),
                issueIdentifier=item.get('prism:issueIdentifier'),
                article_number=item.get('article-number'),
                pageRange=item.get('prism:pageRange'),
                description=info.get('dc:description'),
                authkeywords=info.get('authkeywords'),
                citedby_count=int(item['citedby-count']),
                openaccess=int(item['openaccess']),
                freetoread=freetoread,
                freetoreadLabel=freetoreadLabel,
                fund_acr=item.get('fund-acr'),
                fund_no=fund_no,
                fund_sponsor=item.get('fund-sponsor'),
            )
            out.append(new)
        # Finalize
        check_integrity(out, self._integrity, self._action)
        return out or None

    def __init__(self,
                 query: str,
                 refresh: bool | int = False,
                 view: str | None = None,
                 verbose: bool = False,
                 download: bool = True,
                 integrity_fields: list[str] | tuple[str, ...] | None = None,
                 integrity_action: str = "raise",
                 subscriber: bool = True,
                 unescape: bool = True,
                 **kwds: str
                 ) -> None:
        """Interaction with the Scopus Search API.

        :param query: A string of the query as used in the Advanced Search
                     on scopus.com.  All fields except "INDEXTERMS()" and
                     "LIMIT-TO()" work.
        :param refresh: Whether to refresh the cached file if it exists or not.
                        If int is passed, cached file will be refreshed if the
                        number of days since last modification exceeds that value.
        :param view: Which view to use for the query, see
                     https://dev.elsevier.com/sc_search_views.html.
                     Allowed values: `STANDARD`, `COMPLETE`.  If `None`, defaults to
                     `COMPLETE` if `subscriber=True` and to `STANDARD` if
                     `subscriber=False`.
        :param verbose: Whether to print a download progress bar.
        :param download: Whether to download results (if they have not been
                         cached).
        :param integrity_fields: Names of fields whose completeness should
                                 be checked.  `ScopusSearch` will perform the
                                 action specified in `integrity_action` if
                                 elements in these fields are missing.  This
                                 helps to avoid idiosyncratically missing
                                 elements that should always be present
                                 (e.g., EID or source ID).
        :param integrity_action: What to do in case integrity of provided fields
                                 cannot be verified.  Possible actions:
                                 - `"raise"`: Raise an `AttributeError`
                                 - `"warn"`: Raise a `UserWarning`
        :param subscriber: Whether you access Scopus with a subscription or not.
                           For subscribers, Scopus's cursor navigation will be
                           used.  Sets the number of entries in each query
                           iteration to the maximum number allowed by the
                           corresponding view.
        :param unescape: Convert named and numeric characters in the `results` to
                         their corresponding Unicode characters.
        :param kwds: Keywords passed on as query parameters.  Must contain
                     fields and values mentioned in the API specification at
                     https://dev.elsevier.com/documentation/ScopusSearchAPI.wadl.
                     A `cursor` keyword, if given, overrides `subscriber`
                     for the purpose of cursor navigation.

        Raises
        ------
        ScopusQueryError
            For non-subscribers, if the number of search results exceeds 5000.

        ValueError
            If any of the parameters `integrity_action`, `refresh` or `view`
            is not one of the allowed values.

        Notes
        -----
        The directory for cached results is `{path}/{view}/{fname}`,
        where `path` is specified in your configuration file and `fname` is
        the md5-hashed version of `query`.
        """
        # Checks
        if view:
            check_parameter_value(view, VIEWS['ScopusSearch'], "view")
        allowed = ("warn", "raise")
        check_parameter_value(integrity_action, allowed, "integrity_action")

        # Parameters
        # The default view is derived from `subscriber` *before* a possible
        # `cursor` keyword overrides it below
        if not view:
            view = "COMPLETE" if subscriber else "STANDARD"
        if "cursor" in kwds:
            subscriber = kwds.pop("cursor")

        # Query
        self._action = integrity_action
        self._integrity = integrity_fields or []
        self._refresh = refresh
        self._query = query
        self._view = view
        Search.__init__(self, query=query,
                        cursor=subscriber, download=download,
                        verbose=verbose, **kwds)
        self.unescape = unescape

    def __str__(self) -> str:
        """Print a summary string."""
        return make_search_summary(self, "document", self.get_eids())

    def get_eids(self):
        """EIDs of retrieved documents, in the order of the downloaded
        entries.
        """
        return [d['eid'] for d in self._json]




def _join(item, key, sep=";", unescape=False):
    """Auxiliary function to join same elements of a list of dictionaries if
    the elements are not None.
    """
    try:
        string = sep.join([d[key] or "" for d in item["affiliation"]])
        return html_unescape(string) if unescape else string
    except (KeyError, TypeError):
        return None


def _replace_none(lst, repl=""):
    """Auxiliary function to replace None's with another value."""
    return [repl if v is None else v for v in lst]