# Source code for pybliometrics.sciencedirect.sciencedirect_search

from typing import NamedTuple

from pybliometrics.superclasses import Search
from pybliometrics.utils import check_field_consistency, chained_get, \
    check_integrity, check_parameter_value, deduplicate, \
    make_search_summary, VIEWS


class Document(NamedTuple):
    """Named tuple representing a document from ScienceDirect Search API.

    Instances are created by `ScienceDirectSearch.results`, one per search
    result item.  The comments below name the key of the result item each
    field is read from.
    """
    # Author names of the item, joined on ";"
    authors: str | None
    # Value of the item's "dc:creator" key
    first_author: str | None
    # DOI, read from "prism:doi" or derived from "dc:identifier"
    doi: str | None
    # Value of the item's "dc:title" key
    title: str | None
    # "@href" of the entry in the item's "link" list whose "@ref" is "scidir"
    link: str | None
    # Value of the item's "load-date" key
    load_date: str | None
    # Value of the item's "openaccess" key, passed through without conversion
    # NOTE(review): annotated as bool, but no cast is applied -- confirm the
    # type the API actually delivers
    openaccess_status: bool | None
    # Value of the item's "pii" key
    pii: str | None
    # Value of the item's "prism:coverDate" key
    coverDate: str | None
    # Value of the item's "prism:endingPage" key
    endingPage: str | None
    # Value of the item's "prism:publicationName" key
    publicationName: str | None
    # Value of the item's "prism:startingPage" key
    startingPage: str | None
    # "@href" of the "link" entry whose "@ref" is "self", else "prism:url"
    api_link: str | None
    # Value of the item's "prism:volume" key
    volume: str | None



# [docs]
class ScienceDirectSearch(Search):
    @property
    def results(self) -> list[Document] | None:
        """A list of namedtuples in the form `(authors first_author doi title link
        load_date openaccess_status pii coverDate endingPage publicationName startingPage
        api_link volume)`, or None if there are no results.

        Field definitions correspond to the `ScienceDirect Search Views
        <https://dev.elsevier.com/sd_search_views.html>`__ and return the
        values as-is, with these exceptions:

        - `authors` are joined on `";"`.
        - `doi` is taken from `prism:doi`; only if that is missing, it is
          derived from `dc:identifier` by dropping its first four characters.
        - `link` and `api_link` are taken from the item's `link` entries
          with `@ref` equal to `"scidir"` and `"self"`, respectively;
          `api_link` falls back to `prism:url`.

        Raises
        ------
        ValueError
            If the elements provided in `integrity_fields` do not match the
            actual field names (listed above).

        Notes
        -----
        The list of authors is deduplicated.

        After parsing, the fields named in `integrity_fields` are passed to
        the integrity check together with the chosen `integrity_action`.
        """
        fields = 'authors first_author doi title link load_date openaccess_status pii '\
            'coverDate endingPage publicationName startingPage api_link volume'
        check_field_consistency(self._integrity, fields)
        # Parse elements one-by-one
        out = []
        for item in self._json:
            # Get authors and create ";" separated string
            authors = ';'.join(deduplicate(self._get_authors(item)))
            # Get links; an item without a "link" entry simply yields no links
            links = {'api_link': None, 'scidir': None}
            for link in item.get('link') or []:
                ref = link.get('@ref')
                if ref == 'self':
                    links['api_link'] = link.get('@href')
                elif ref == 'scidir':
                    links['scidir'] = link.get('@href')
            # Get doi: "prism:doi" wins on its own merits.  The fallback to
            # "dc:identifier" is evaluated separately so that a missing
            # identifier can no longer discard an existing "prism:doi".
            doi = item.get("prism:doi")
            if not doi:
                identifier = item.get("dc:identifier")
                doi = identifier[4:] if identifier else None
            new = Document(
                authors=authors,
                first_author=item.get('dc:creator'),
                doi=doi,
                title=item.get("dc:title"),
                link=links["scidir"],
                load_date=item.get("load-date"),
                openaccess_status=item.get("openaccess"),
                pii=item.get("pii"),
                coverDate=item.get("prism:coverDate"),
                endingPage=item.get("prism:endingPage"),
                publicationName=item.get("prism:publicationName"),
                startingPage=item.get("prism:startingPage"),
                api_link=links["api_link"] or item.get("prism:url"),
                volume=item.get("prism:volume")
            )
            out.append(new)
        check_integrity(out, self._integrity, self._action)
        return out or None

    def __init__(self,
                 query: str,
                 refresh: bool | int = False,
                 view: str | None = None,
                 verbose: bool = False,
                 download: bool = True,
                 integrity_fields: list[str] | tuple[str, ...] | None = None,
                 integrity_action: str = "raise",
                 subscriber: bool = True,
                 **kwds: str
                 ) -> None:
        """Interaction with the ScienceDirect Search API.  This represents a
        search against the ScienceDirect cluster, which contains
        serial/nonserial full-text articles.  Note that this API replicates
        the search experience on `ScienceDirect <https://www.sciencedirect.com>`__.

        :param query: A string of the query as used in the `ScienceDirect Search <https://dev.elsevier.com/tecdoc_sdsearch_migration.html>`__.
        :param refresh: Whether to refresh the cached file if it exists or not.
                        If int is passed, cached file will be refreshed if the
                        number of days since last modification exceeds that value.
        :param view: Which view to use for the query, see `the documentation <https://dev.elsevier.com/sd_search_views.html>`__.
                     Allowed values: `STANDARD`.  Defaults to `STANDARD`
                     when not provided.
        :param verbose: Whether to print a download progress bar.
        :param download: Whether to download results (if they have not been
                         cached).
        :param integrity_fields: A list or tuple with the names of fields
                                 whose completeness should be checked.
                                 `ScienceDirectSearch` will perform the
                                 action specified in `integrity_action` if
                                 elements in these fields are missing.  This
                                 helps to avoid idiosyncratically missing
                                 elements that should always be present
                                 (e.g., doi or authors).
        :param integrity_action: What to do in case integrity of provided fields
                                 cannot be verified.  Possible actions:
                                 - `"raise"`: Raise an `AttributeError`
                                 - `"warn"`: Raise a `UserWarning`
        :param subscriber: Whether you access ScienceDirect with a subscription or not.
                           For subscribers, ScienceDirect's cursor navigation will be
                           used.  Sets the number of entries in each query
                           iteration to the maximum number allowed by the
                           corresponding view.
        :param kwds: Keywords passed on as query parameters.  Must contain
                     fields and values mentioned in the `API specification <https://dev.elsevier.com/documentation/ArticleMetadataAPI.wadl>`__.

        Raises
        ------
        ScopusQueryError
            For non-subscribers, if the number of search results exceeds 5000.

        ValueError
            If any of the parameters `integrity_action`, `refresh` or `view`
            is not one of the allowed values.

        Notes
        -----
        The directory for cached results is `{path}/{view}/{fname}`,
        where `path` is specified in your configuration file and `fname` is
        the md5-hashed version of `query`.

        The ScienceDirect Search API V2 has two available interfaces: `PUT`
        and `GET`.  This library uses the `GET` interface.
        """
        # Fall back to the default view when none is given, validate otherwise
        if not view:
            view = "STANDARD"
        else:
            check_parameter_value(view, VIEWS['ScienceDirectSearch'], "view")

        check_parameter_value(integrity_action, ("warn", "raise"),
                              "integrity_action")

        # Store the settings before the superclass is initialised
        self._action = integrity_action
        self._integrity = integrity_fields or []
        self._refresh = refresh
        self._query = query
        self._view = view
        # NOTE(review): `subscriber` is accepted but neither stored nor
        # forwarded to `Search.__init__` here -- confirm whether that is
        # intended.
        Search.__init__(self, query=query, download=download,
                        verbose=verbose, **kwds)

    def __str__(self):
        """Return a summary string of the search.

        The summary is produced by `make_search_summary` from this object,
        the label "document" and the DOIs of the retrieved documents.
        """
        dois = self.get_dois()
        return make_search_summary(self, "document", dois)


# [docs]
    def get_dois(self):
        """DOIs of retrieved documents."""
        return [d.get("prism:doi") or d.get("dc:identifier")[4:] if d.get("dc:identifier") else None for d in self._json]


    def _get_authors(self, item: dict) -> list:
        """Auxiliary function to get the authors of a search result item.

        Looks up `item['authors']['author']` (empty list if absent).  A list
        is mapped to the `'$'` value of each of its entries, a plain string
        is wrapped in a one-element list, and anything else gives an empty
        list.
        """
        raw = chained_get(item, ['authors', 'author'], [])
        if isinstance(raw, list):
            return [entry.get('$') for entry in raw]
        if isinstance(raw, str):
            return [raw]
        return []