from typing import NamedTuple
from pybliometrics.superclasses import Search
from pybliometrics.utils import check_field_consistency, chained_get, \
check_integrity, check_parameter_value, deduplicate, \
make_search_summary, VIEWS
class Document(NamedTuple):
"""Named tuple representing a document from ScienceDirect Search API."""
authors: str | None
first_author: str | None
doi: str | None
title: str | None
link: str | None
load_date: str | None
openaccess_status: bool | None
pii: str | None
coverDate: str | None
endingPage: str | None
publicationName: str | None
startingPage: str | None
api_link: str | None
volume: str | None
[docs]
class ScienceDirectSearch(Search):
    # Wrapper around the ScienceDirect Search API; the user-facing
    # documentation lives in the `__init__` docstring below.

    @property
    def results(self) -> list[Document] | None:
        """A list of namedtuples in the form `(authors first_author doi title link
        load_date openaccess_status pii coverDate endingPage publicationName startingPage
        api_link volume)`.

        Field definitions correspond to the `ScienceDirect Search Views
        <https://dev.elsevier.com/sd_search_views.html>`__ and return the
        values as-is, except for `authors` which are joined on `";"`.
        The `doi` is taken from `prism:doi` when present and otherwise
        derived from `dc:identifier`.  Returns `None` if there are no
        results.

        Raises
        ------
        ValueError
            If the elements provided in `integrity_fields` do not match the
            actual field names (listed above).

        Notes
        -----
        The list of authors is deduplicated.
        """
        fields = 'authors first_author doi title link load_date openaccess_status pii '\
                 'coverDate endingPage publicationName startingPage api_link volume'
        check_field_consistency(self._integrity, fields)
        # Parse elements one-by-one
        out = []
        for item in self._json or []:
            # Get authors and create ";" separated string
            authors_list = deduplicate(self._get_authors(item))
            authors = ';'.join(authors_list)
            # Get links; an item may carry no "link" entry at all, and a
            # single link may arrive as a bare dict instead of a list
            links = {'api_link': None, 'scidir': None}
            links_found = item.get('link') or []
            if isinstance(links_found, dict):
                links_found = [links_found]
            for link in links_found:
                ref = link.get('@ref')
                if ref == 'self':
                    links['api_link'] = link.get('@href')
                elif ref == 'scidir':
                    links['scidir'] = link.get('@href')
            new = Document(
                authors=authors,
                first_author=item.get('dc:creator'),
                doi=self._extract_doi(item),
                title=item.get("dc:title"),
                link=links["scidir"],
                load_date=item.get("load-date"),
                openaccess_status=item.get("openaccess"),
                pii=item.get("pii"),
                coverDate=item.get("prism:coverDate"),
                endingPage=item.get("prism:endingPage"),
                publicationName=item.get("prism:publicationName"),
                startingPage=item.get("prism:startingPage"),
                api_link=links["api_link"] or item.get("prism:url"),
                volume=item.get("prism:volume")
            )
            out.append(new)
        check_integrity(out, self._integrity, self._action)
        return out or None

    def __init__(self,
                 query: str,
                 refresh: bool | int = False,
                 view: str | None = None,
                 verbose: bool = False,
                 download: bool = True,
                 integrity_fields: list[str] | tuple[str, ...] | None = None,
                 integrity_action: str = "raise",
                 subscriber: bool = True,
                 **kwds: str
                 ) -> None:
        """Interaction with the ScienceDirect Search API.  This represents a search against the
        ScienceDirect cluster, which contains serial/nonserial full-text articles.  Note that this API
        replicates the search experience on `ScienceDirect <https://www.sciencedirect.com>`__.

        :param query: A string of the query as used in the `ScienceDirect Search <https://dev.elsevier.com/tecdoc_sdsearch_migration.html>`__.
        :param refresh: Whether to refresh the cached file if it exists or not.
                        If int is passed, cached file will be refreshed if the
                        number of days since last modification exceeds that value.
        :param view: Which view to use for the query, see `the documentation <https://dev.elsevier.com/sd_search_views.html>`__.
                     Allowed values: `STANDARD`.
        :param verbose: Whether to print a download progress bar.
        :param download: Whether to download results (if they have not been
                         cached).
        :param integrity_fields: A list or tuple with the names of fields whose completeness should
                                 be checked.  `ScienceDirectSearch` will perform the
                                 action specified in `integrity_action` if
                                 elements in these fields are missing.  This
                                 helps to avoid idiosyncratically missing
                                 elements that should always be present
                                 (e.g., doi or authors).
        :param integrity_action: What to do in case integrity of provided fields
                                 cannot be verified.  Possible actions:
                                 - `"raise"`: Raise an `AttributeError`
                                 - `"warn"`: Raise a `UserWarning`
        :param subscriber: Whether you access ScienceDirect with a subscription or not.
                           For subscribers, ScienceDirect's cursor navigation will be
                           used.  Sets the number of entries in each query
                           iteration to the maximum number allowed by the
                           corresponding view.
        :param kwds: Keywords passed on as query parameters.  Must contain
                     fields and values mentioned in the `API specification <https://dev.elsevier.com/documentation/ArticleMetadataAPI.wadl>`__.

        Raises
        ------
        ScopusQueryError
            For non-subscribers, if the number of search results exceeds 5000.

        ValueError
            If any of the parameters `integrity_action`, `refresh` or `view`
            is not one of the allowed values.

        Notes
        -----
        The directory for cached results is `{path}/{view}/{fname}`,
        where `path` is specified in your configuration file and `fname` is
        the md5-hashed version of `query`.

        The ScienceDirect Search API V2 has two available interfaces: `PUT` and `GET`.  This library uses the
        `GET` interface.
        """
        # Check view or set to default
        if view:
            check_parameter_value(view, VIEWS['ScienceDirectSearch'], "view")
        else:
            view = "STANDARD"
        allowed = ("warn", "raise")
        check_parameter_value(integrity_action, allowed, "integrity_action")

        # Query
        self._action = integrity_action
        self._integrity = integrity_fields or []
        self._refresh = refresh
        self._query = query
        self._view = view
        # NOTE(review): `subscriber` is accepted and documented above but is
        # not forwarded to `Search.__init__` here -- confirm this is intended.
        Search.__init__(self, query=query, download=download, verbose=verbose, **kwds)

    def __str__(self) -> str:
        """Print a summary string."""
        return make_search_summary(self, "document", self.get_dois())

    def get_dois(self) -> list[str | None]:
        """DOIs of retrieved documents.

        Each entry is resolved the same way as the `doi` field of
        `results`; documents without any DOI information yield `None`.
        """
        return [self._extract_doi(item) for item in self._json or []]

    @staticmethod
    def _extract_doi(item: dict) -> str | None:
        """Auxiliary function to get the DOI of a single result item.

        `prism:doi` takes precedence.  Otherwise the DOI is derived from
        `dc:identifier`; `None` is returned when neither is available.
        """
        doi = item.get("prism:doi")
        if doi:
            return doi
        identifier = item.get("dc:identifier")
        if identifier:
            # Drops the first four characters; assumes the identifier
            # carries a 4-character prefix such as "DOI:" -- TODO confirm
            return identifier[4:]
        return None

    def _get_authors(self, item: dict) -> list:
        """Auxiliary function to get the authors.

        Returns a list of author names.  A list of entries is reduced to
        the names stored under `'$'` (plain strings are used as-is), a
        single string becomes a one-element list, and anything else yields
        an empty list.  Entries without a usable name are skipped so the
        result can always be joined into a string.
        """
        authors_data = chained_get(item, ['authors', 'author'], [])
        if isinstance(authors_data, str):
            return [authors_data]
        if not isinstance(authors_data, list):
            return []
        authors_list = []
        for author in authors_data:
            name = author.get('$') if isinstance(author, dict) else author
            if isinstance(name, str) and name:
                authors_list.append(name)
        return authors_list