# Source code for pybliometrics.sciencedirect.article_metadata

from typing import NamedTuple

from pybliometrics.superclasses import Search
from pybliometrics.utils import check_field_consistency, chained_get, \
    check_integrity, check_parameter_value, deduplicate, \
    make_search_summary, VIEWS


class Document(NamedTuple):
    """Named tuple representing a document from ScienceDirect Article Metadata API.

    One instance is created per entry of the API response by
    `ArticleMetadata.results`.  The comment next to each field names the
    response key the value is read from.  The order of the fields is part of
    the public interface (positional access, unpacking) and must not change.
    """
    authorKeywords: str | None  # "authkeywords"
    authors: str | None  # "$" of each entry in "authors" -> "author", deduplicated, joined on ";"
    available_online_date: str | None  # "available-online-date"
    first_author: str | None  # "$" of the first "dc:creator" entry
    abstract_text: str | None  # "dc:description"
    doi: str | None  # "prism:doi", or derived from "dc:identifier"
    title: str | None  # "dc:title"
    eid: str | None  # "eid"
    link: str | None  # "@href" of the first "link" entry
    openArchiveArticle: bool | None  # "openArchiveArticle"
    openaccess_status: str | None  # "openaccess"
    openaccessArticle: bool | None  # "openaccessArticle"
    openaccessUserLicense: str | None  # "openaccessUserLicense"
    pii: str | None  # "pii"
    aggregationType: str | None  # "prism:aggregationType"
    copyright: str | None  # "prism:copyright"
    coverDate: str | None  # "prism:coverDate"
    coverDisplayDate: str | None  # "prism:coverDisplayDate"
    edition: str | None  # "prism:edition"
    endingPage: str | None  # "prism:endingPage"
    isbn: str | None  # "prism:isbn"
    publicationName: str | None  # "prism:publicationName"
    startingPage: str | None  # "prism:startingPage"
    teaser: str | None  # "prism:teaser"
    api_link: str | None  # "prism:url"
    publicationType: str | None  # "pubType"
    vor_available_online_date: str | None  # "vor-available-online-date"



class ArticleMetadata(Search):
    """Search interface to the ScienceDirect Article Metadata API.

    See `__init__` for the accepted parameters and `results` for the parsed
    documents.
    """

    @property
    def results(self) -> list[Document] | None:
        """A list of namedtuples in the form `(authorKeywords authors available_online_date
        first_author abstract_text doi title eid link openArchiveArticle openaccess_status
        openaccessArticle openaccessUserLicense pii aggregationType copyright coverDate
        coverDisplayDate edition endingPage isbn publicationName startingPage teaser
        api_link publicationType vor_available_online_date)`, or `None` if
        there are no documents.

        Field definitions correspond to the `Article Metadata Views
        <https://dev.elsevier.com/sd_article_meta_views.html>`__ and return the
        values as-is, except for:

        - `authors`, which are joined on `";"`;
        - `first_author` and `link`, which are taken from the first entry of
          `dc:creator` and `link`, respectively (`None` if there is none);
        - `doi`, which is `prism:doi` if present and otherwise derived from
          `dc:identifier`.

        Raises
        ------
        ValueError
            If the elements provided in `integrity_fields` do not match the
            actual field names (listed above).

        Notes
        -----
        The list of authors is deduplicated.
        """
        # Derive the allowed field names from the namedtuple itself so that
        # the two can never drift apart.
        fields = ' '.join(Document._fields)
        check_field_consistency(self._integrity, fields)
        # Parse elements one-by-one
        out = []
        for item in self._json:
            # Get authors and create ";" separated string; entries without a
            # name are skipped because str.join() cannot handle None
            names = [author.get('$') for author
                     in chained_get(item, ['authors', 'author'], [])]
            authors = ';'.join(deduplicate([name for name in names if name]))
            # "dc:creator" and "link" are lists of dicts; a missing key or an
            # empty list yields None instead of raising
            creators = item.get('dc:creator') or [{}]
            first_author = creators[0].get('$')
            links = item.get('link') or [{}]
            link = links[0].get('@href')
            # Prefer the explicit DOI.  Only fall back to the identifier if
            # "prism:doi" is missing; the parentheses are required because a
            # conditional expression binds weaker than `or`.
            # NOTE(review): the first four characters of "dc:identifier" are
            # presumably a "DOI:" prefix - confirm against the API.
            identifier = item.get('dc:identifier')
            doi = item.get('prism:doi') or (identifier[4:] if identifier else None)
            new = Document(
                authorKeywords=item.get('authkeywords'),
                authors=authors,
                available_online_date=item.get('available-online-date'),
                first_author=first_author,
                abstract_text=item.get('dc:description'),
                doi=doi,
                title=item.get('dc:title'),
                eid=item.get('eid'),
                link=link,
                openArchiveArticle=item.get('openArchiveArticle'),
                openaccess_status=item.get('openaccess'),
                openaccessArticle=item.get('openaccessArticle'),
                openaccessUserLicense=item.get('openaccessUserLicense'),
                pii=item.get('pii'),
                aggregationType=item.get('prism:aggregationType'),
                copyright=item.get('prism:copyright'),
                coverDate=item.get('prism:coverDate'),
                coverDisplayDate=item.get('prism:coverDisplayDate'),
                edition=item.get('prism:edition'),
                endingPage=item.get('prism:endingPage'),
                isbn=item.get('prism:isbn'),
                publicationName=item.get('prism:publicationName'),
                startingPage=item.get('prism:startingPage'),
                teaser=item.get('prism:teaser'),
                api_link=item.get('prism:url'),
                publicationType=item.get('pubType'),
                vor_available_online_date=item.get('vor-available-online-date'),
            )
            out.append(new)
        check_integrity(out, self._integrity, self._action)
        return out or None

    def __init__(self,
                 query: str,
                 refresh: bool | int = False,
                 view: str | None = None,
                 verbose: bool = False,
                 download: bool = True,
                 integrity_fields: list[str] | tuple[str, ...] | None = None,
                 integrity_action: str = "raise",
                 subscriber: bool = True,
                 **kwds: str
                 ) -> None:
        """Interaction with the ScienceDirect Article Metadata API.

        :param query: A string of the query as used in the `Advanced Search <https://dev.elsevier.com/tecdoc_sdsearch_migration.html>`__.
        :param refresh: Whether to refresh the cached file if it exists or not.
                        If int is passed, cached file will be refreshed if the
                        number of days since last modification exceeds that value.
        :param view: Which view to use for the query, see `the documentation <https://dev.elsevier.com/sd_article_meta_views.html>`__.
                     Allowed values: `STANDARD`, `COMPLETE`.  If `None`, defaults to
                     `COMPLETE` if `subscriber=True` and to `STANDARD` if
                     `subscriber=False`.
        :param verbose: Whether to print a download progress bar.
        :param download: Whether to download results (if they have not been
                         cached).
        :param integrity_fields: A list or tuple with the names of fields whose completeness should
                                 be checked.  `ArticleMetadata` will perform the
                                 action specified in `integrity_action` if
                                 elements in these fields are missing.  This
                                 helps to avoid idiosyncratically missing
                                 elements that should always be present
                                 (e.g., EID or DOI).
        :param integrity_action: What to do in case integrity of provided fields
                                 cannot be verified.  Possible actions:
                                 - `"raise"`: Raise an `AttributeError`
                                 - `"warn"`: Raise a `UserWarning`
        :param subscriber: Whether you access ScienceDirect with a subscription or not.
                           For subscribers, ScienceDirect's cursor navigation will be
                           used.  Sets the number of entries in each query
                           iteration to the maximum number allowed by the
                           corresponding view.
        :param kwds: Keywords passed on as query parameters.  Must contain
                     fields and values mentioned in the `API specification <https://dev.elsevier.com/documentation/ArticleMetadataAPI.wadl>`__.

        Raises
        ------
        ScopusQueryError
            For non-subscribers, if the number of search results exceeds 5000.

        ValueError
            If any of the parameters `integrity_action`, `refresh` or `view`
            is not one of the allowed values.

        Notes
        -----
        The directory for cached results is `{path}/{view}/{fname}`,
        where `path` is specified in your configuration file and `fname` is
        the md5-hashed version of `query`.
        """
        # NOTE(review): an `unescape` parameter used to be documented here,
        # but the signature has no such parameter; a keyword of that name
        # would simply be forwarded through `kwds`.

        # Check view or set to default
        if view:
            check_parameter_value(view, VIEWS['ArticleMetadata'], "view")
        else:
            view = "COMPLETE" if subscriber else "STANDARD"

        allowed = ("warn", "raise")
        check_parameter_value(integrity_action, allowed, "integrity_action")

        # Query; the attributes are set before calling the parent constructor
        # and are read again by `results`
        self._action = integrity_action
        self._integrity = integrity_fields or []
        self._refresh = refresh
        self._query = query
        self._view = view
        Search.__init__(self, query=query, download=download, verbose=verbose, **kwds)

    def __str__(self) -> str:
        """Print a summary string."""
        return make_search_summary(self, "document", self.get_eids())

    def get_eids(self) -> list[str]:
        """EIDs of retrieved documents.

        Unlike `results`, this accesses `eid` strictly: an entry without an
        `eid` key raises a `KeyError`.
        """
        return [d['eid'] for d in self._json]