from typing import NamedTuple
from pybliometrics.superclasses import Search
from pybliometrics.utils import check_field_consistency, chained_get, \
check_integrity, check_parameter_value, deduplicate, \
make_search_summary, VIEWS
class Document(NamedTuple):
"""Named tuple representing a document from ScienceDirect Article Metadata API."""
authorKeywords: str | None
authors: str | None
available_online_date: str | None
first_author: str | None
abstract_text: str | None
doi: str | None
title: str | None
eid: str | None
link: str | None
openArchiveArticle: bool | None
openaccess_status: str | None
openaccessArticle: bool | None
openaccessUserLicense: str | None
pii: str | None
aggregationType: str | None
copyright: str | None
coverDate: str | None
coverDisplayDate: str | None
edition: str | None
endingPage: str | None
isbn: str | None
publicationName: str | None
startingPage: str | None
teaser: str | None
api_link: str | None
publicationType: str | None
vor_available_online_date: str | None
class ArticleMetadata(Search):
    """Interaction with the ScienceDirect Article Metadata API."""

    @property
    def results(self) -> list[Document] | None:
        """A list of namedtuples in the form `(authorKeywords authors
        available_online_date first_author abstract_text doi title eid link
        openArchiveArticle openaccess_status openaccessArticle
        openaccessUserLicense pii aggregationType copyright coverDate
        coverDisplayDate edition endingPage isbn publicationName
        startingPage teaser api_link publicationType
        vor_available_online_date)`, or `None` if there are no results.

        Field definitions correspond to the `Article Metadata Views
        <https://dev.elsevier.com/sd_article_meta_views.html>`__ and return
        the values as-is, except for:

        - `authors`, which are joined on `";"`;
        - `first_author` and `link`, which are taken from the first entry
          of `dc:creator` and `link`, respectively;
        - `doi`, which falls back to `dc:identifier` (minus its first four
          characters) when `prism:doi` is missing.

        Fields whose source key is missing from the response are `None`.

        Raises
        ------
        ValueError
            If the elements provided in `integrity_fields` do not match the
            actual field names (listed above).

        Notes
        -----
        The list of authors is deduplicated.
        """
        # Derive the field names from the namedtuple so that both stay in
        # sync; this yields the same space-separated string as before.
        fields = ' '.join(Document._fields)
        check_field_consistency(self._integrity, fields)

        # Parse elements one-by-one
        out = []
        for item in self._json:
            # Get authors and create ";" separated string
            author_entries = chained_get(item, ['authors', 'author'], []) or []
            authors_list = deduplicate([a.get('$') for a in author_entries])
            authors = ';'.join(authors_list)

            # Missing or empty lists yield None instead of raising, so that
            # the integrity check below reports incomplete records.
            creators = item.get('dc:creator') or [{}]
            first_author = creators[0].get('$')
            links = item.get('link') or [{}]
            link = links[0].get('@href')

            # Prefer the explicit DOI; only fall back to the identifier if
            # there is none.  (Previously a missing identifier discarded an
            # existing DOI due to operator precedence.)
            doi = item.get('prism:doi')
            if not doi:
                identifier = item.get('dc:identifier')
                # NOTE(review): the slice presumably strips a "DOI:" prefix;
                # confirm against the API response.
                doi = identifier[4:] if identifier else None

            new = Document(
                authorKeywords=item.get('authkeywords'),
                authors=authors,
                available_online_date=item.get('available-online-date'),
                first_author=first_author,
                abstract_text=item.get('dc:description'),
                doi=doi,
                title=item.get('dc:title'),
                eid=item.get('eid'),
                link=link,
                openArchiveArticle=item.get('openArchiveArticle'),
                openaccess_status=item.get('openaccess'),
                openaccessArticle=item.get('openaccessArticle'),
                openaccessUserLicense=item.get('openaccessUserLicense'),
                pii=item.get('pii'),
                aggregationType=item.get('prism:aggregationType'),
                copyright=item.get('prism:copyright'),
                coverDate=item.get('prism:coverDate'),
                coverDisplayDate=item.get('prism:coverDisplayDate'),
                edition=item.get('prism:edition'),
                endingPage=item.get('prism:endingPage'),
                isbn=item.get('prism:isbn'),
                publicationName=item.get('prism:publicationName'),
                startingPage=item.get('prism:startingPage'),
                teaser=item.get('prism:teaser'),
                api_link=item.get('prism:url'),
                publicationType=item.get('pubType'),
                vor_available_online_date=item.get('vor-available-online-date'),
            )
            out.append(new)

        check_integrity(out, self._integrity, self._action)
        return out or None

    def __init__(self,
                 query: str,
                 refresh: bool | int = False,
                 view: str | None = None,
                 verbose: bool = False,
                 download: bool = True,
                 integrity_fields: list[str] | tuple[str, ...] | None = None,
                 integrity_action: str = "raise",
                 subscriber: bool = True,
                 **kwds: str
                 ) -> None:
        """Interaction with the ScienceDirect Article Metadata API.

        :param query: A string of the query as used in the `Advanced Search
                      <https://dev.elsevier.com/tecdoc_sdsearch_migration.html>`__.
        :param refresh: Whether to refresh the cached file if it exists or
                        not.  If int is passed, cached file will be
                        refreshed if the number of days since last
                        modification exceeds that value.
        :param view: Which view to use for the query, see `the documentation
                     <https://dev.elsevier.com/sd_article_meta_views.html>`__.
                     Allowed values: `STANDARD`, `COMPLETE`.  If `None`,
                     defaults to `COMPLETE` if `subscriber=True` and to
                     `STANDARD` if `subscriber=False`.
        :param verbose: Whether to print a download progress bar.
        :param download: Whether to download results (if they have not been
                         cached).
        :param integrity_fields: A list or tuple with the names of fields
                                 whose completeness should be checked.
                                 `ArticleMetadata` will perform the action
                                 specified in `integrity_action` if elements
                                 in these fields are missing.  This helps to
                                 avoid idiosyncratically missing elements
                                 that should always be present (e.g., EID
                                 or source ID).
        :param integrity_action: What to do in case integrity of provided
                                 fields cannot be verified.  Possible
                                 actions:
                                 - `"raise"`: Raise an `AttributeError`
                                 - `"warn"`: Raise a `UserWarning`
        :param subscriber: Whether you access ScienceDirect with a
                           subscription or not.  In this constructor it
                           only selects the default `view` when none is
                           given.
        :param kwds: Keywords passed on as query parameters.  Must contain
                     fields and values mentioned in the `API specification
                     <https://dev.elsevier.com/documentation/ArticleMetadataAPI.wadl>`__.

        Raises
        ------
        ScopusQueryError
            For non-subscribers, if the number of search results exceeds
            5000.  NOTE(review): raised outside this method; confirm
            against `Search`.
        ValueError
            If `integrity_action` or `view` is not one of the allowed
            values.

        Notes
        -----
        The directory for cached results is `{path}/{view}/{fname}`,
        where `path` is specified in your configuration file and `fname` is
        the md5-hashed version of `query`.
        """
        # Check view or set to default
        if view:
            check_parameter_value(view, VIEWS['ArticleMetadata'], "view")
        else:
            view = "COMPLETE" if subscriber else "STANDARD"

        allowed = ("warn", "raise")
        check_parameter_value(integrity_action, allowed, "integrity_action")

        # Query; the attributes are set before handing over to the parent
        # initialiser, exactly as in the original statement order.
        self._action = integrity_action
        self._integrity = integrity_fields or []
        self._refresh = refresh
        self._query = query
        self._view = view
        Search.__init__(self, query=query, download=download,
                        verbose=verbose, **kwds)

    def __str__(self) -> str:
        """Print a summary string."""
        return make_search_summary(self, "document", self.get_eids())

    def get_eids(self) -> list[str]:
        """EIDs of retrieved documents."""
        return [d['eid'] for d in self._json]