from collections import namedtuple
from warnings import warn
from typing import List, NamedTuple, Optional, Tuple, Union
from json import loads
from .author_search import AuthorSearch
from .scopus_search import ScopusSearch
from pybliometrics.scopus.superclasses import Retrieval
from pybliometrics.scopus.utils import chained_get, check_parameter_value,\
filter_digits, get_content, get_link, html_unescape, listify, make_int_if_possible,\
parse_affiliation, parse_date_created
[docs]
class AuthorRetrieval(Retrieval):
@property
def affiliation_current(self) -> Optional[List[NamedTuple]]:
"""A list of namedtuples representing the authors's current
affiliation(s), in the form `(id parent type relationship afdispname
preferred_name parent_preferred_name country_code country address_part
city state postal_code org_domain org_URL)`.
Note: Affiliation information might be missing or mal-assigned even
when it lookes correct in the web view. In this case please request
a correction.
"""
if self._view in ('STANDARD', 'ENHANCED'):
affs = chained_get(self._profile, ["affiliation-current", "affiliation"])
elif self._view == 'LIGHT':
affs = self._json.get('affiliation-current')
else:
return None
return parse_affiliation(affs or {}, self._view)
@property
def affiliation_history(self) -> Optional[List[NamedTuple]]:
"""A list of namedtuples representing the authors's historical
affiliation(s), in the form `(id parent type relationship afdispname
preferred_name parent_preferred_name country_code country address_part
city state postal_code org_domain org_URL)`.
Note: Affiliation information might be missing or mal-assigned even
when it lookes correct in the web view. In this case please request
a correction.
Note: Unlike on their website, Scopus doesn't provide the periods
of affiliation.
"""
affs = chained_get(self._profile, ["affiliation-history", "affiliation"])
return parse_affiliation(affs or {}, self._view)
@property
def alias(self) -> Optional[List[str]]:
"""List of possible new Scopus Author Profile IDs in case the profile
has been merged.
"""
return self._alias
@property
def citation_count(self) -> int:
"""Total number of citing items."""
return int(self._json['coredata']['citation-count'])
@property
def cited_by_count(self) -> int:
"""Total number of citing authors."""
return int(self._json['coredata']['cited-by-count'])
@property
def classificationgroup(self) -> Optional[List[Tuple[int, int]]]:
"""List with tuples with form`(subject group ID, number of documents)`."""
path = ['classificationgroup', 'classifications', 'classification']
out = [(int(filter_digits(item['$'])), int(filter_digits(item['@frequency'])))
for item in listify(chained_get(self._profile, path, []))]
return out or None
@property
def coauthor_count(self) -> Optional[int]:
"""Total number of coauthors."""
return make_int_if_possible(chained_get(self._json, ['coauthor-count']))
@property
def coauthor_link(self) -> Optional[str]:
"""URL to Scopus API search page for coauthors."""
return get_link(self._json, 3)
@property
def date_created(self) -> Optional[Tuple[int, int, int]]:
"""Date the Scopus record was created."""
try:
return parse_date_created(self._profile)
except KeyError:
return None
@property
def document_count(self) -> int:
"""Number of documents authored (excludes book chapters and notes)."""
return int(self._json['coredata']['document-count'])
@property
def eid(self) -> Optional[str]:
"""The EID of the author. If it differs from the one provided,
pybliometrics will throw a warning informing the user about
author profile merges.
"""
return self._json['coredata'].get('eid')
@property
def given_name(self) -> Optional[str]:
"""Author's preferred given name."""
return html_unescape(chained_get(self._profile, ['preferred-name', 'given-name']))
@property
def h_index(self) -> Optional[str]:
"""The author's h-index."""
return make_int_if_possible(chained_get(self._json, ['h-index']))
@property
def historical_identifier(self) -> Optional[List[int]]:
"""Scopus IDs of previous profiles now compromising this profile."""
hist = chained_get(self._json, ["coredata", 'historical-identifier'], [])
return [int(d['$'].split(":")[-1]) for d in hist] or None
@property
def identifier(self) -> int:
"""The author's ID. Might differ from the one provided."""
ident = self._json['coredata']['dc:identifier'].split(":")[-1]
if ident != self._id:
text = f"Profile with ID {self._id} has been merged and the new "\
f"ID is {ident}. Please update your records manually. "\
"Files have been cached with the old ID."
warn(text, UserWarning)
return int(ident)
@property
def indexed_name(self) -> Optional[str]:
"""Author's name as indexed by Scopus."""
if self._view in ('STANDARD', 'ENHANCED'):
indexed_name = html_unescape(chained_get(self._profile, ['preferred-name', 'indexed-name']))
elif self._view == 'LIGHT':
# Try to get indexed name from name-variants
name_variants = chained_get(self._json, ['name-variants', 'name-variant'])
if name_variants:
indexed_name = chained_get(name_variants[0], ['name-variant', 'indexed-name'])
else:
# In case of no name-variants get name from preferred-name
preferred_name = self._json.get('preferred-name')
indexed_name = ' '.join([preferred_name.get('initials', ''), preferred_name.get('surname', '')])
else:
indexed_name = None
return indexed_name
@property
def initials(self) -> Optional[str]:
"""Author's preferred initials."""
return html_unescape(chained_get(self._profile, ['preferred-name', 'initials']))
@property
def name_variants(self) -> Optional[List[NamedTuple]]:
"""List of named tuples containing variants of the author name with
number of documents published with that variant.
"""
fields = 'indexed_name initials surname given_name doc_count'
variant = namedtuple('Variant', fields)
out = [variant(indexed_name=html_unescape(var['indexed-name']), surname=html_unescape(var['surname']),
doc_count=var.get('@doc-count'), initials=html_unescape(var['initials']),
given_name=html_unescape(var.get('given-name')))
for var in listify(self._profile.get('name-variant', []))]
return out or None
@property
def orcid(self) -> Optional[str]:
"""The author's ORCID."""
return self._json['coredata'].get('orcid')
@property
def publication_range(self) -> Optional[Tuple[int, int]]:
"""Tuple containing years of first and last publication."""
if self._view in ('STANDARD', 'ENHANCED', 'LIGHT'):
if self._view in ('STANDARD', 'ENHANCED'):
r = self._profile.get('publication-range')
start = '@start'
end = '@end'
elif self._view == 'LIGHT':
r = self._json.get('publication-range')
start = 'start'
end = 'end'
try:
return int(r.get(start)), int(r.get(end))
except TypeError:
return None
return None
@property
def scopus_author_link(self) -> Optional[str]:
"""Link to the Scopus web view of the author."""
return get_link(self._json, 1)
@property
def search_link(self) -> Optional[str]:
"""URL to the API page listing documents of the author."""
return get_link(self._json, 2)
@property
def self_link(self) -> Optional[str]:
"""Link to the author's API page."""
return get_link(self._json, 0)
@property
def status(self) -> Optional[str]:
"""The status of the author profile."""
return self._profile.get("status")
@property
def subject_areas(self) -> Optional[List[NamedTuple]]:
"""List of named tuples of subject areas in the form
`(area, abbreviation, code)` of author's publication.
"""
path = ['subject-areas', 'subject-area']
area = namedtuple('Subjectarea', 'area abbreviation code')
areas = [area(area=item['$'], code=int(item['@code']),
abbreviation=item['@abbrev'])
for item in chained_get(self._json, path, [])]
return areas or None
@property
def surname(self) -> Optional[str]:
"""Author's preferred surname."""
return html_unescape(chained_get(self._profile, ['preferred-name', 'surname']))
@property
def url(self) -> Optional[str]:
"""URL to the author's API page."""
return self._json['coredata']['prism:url']
def __init__(self,
author_id: Union[int, str],
refresh: Union[bool, int] = False,
view: str = "ENHANCED",
**kwds: str
) -> None:
"""Interaction with the Author Retrieval API.
:param author_id: The ID or the EID of the author.
:param refresh: Whether to refresh the cached file if it exists or not.
If int is passed, cached file will be refreshed if the
number of days since last modification exceeds that value.
:param view: The view of the file that should be downloaded. Allowed
values: `METRICS`, `LIGHT`, `STANDARD`, `ENHANCED`, where `STANDARD`
includes all information of `LIGHT` view and `ENHANCED`
includes all information of any view. For details see
https://dev.elsevier.com/sc_author_retrieval_views.html.
Note: Neither the `BASIC` nor the `DOCUMENTS` view are active,
although documented.
:param kwds: Keywords passed on as query parameters. Must contain
fields and values mentioned in the API specification at
https://dev.elsevier.com/documentation/AuthorRetrievalAPI.wadl.
Raises
------
ValueError
If any of the parameters `refresh` or `view` is not
one of the allowed values.
Notes
-----
The directory for cached results is `{path}/ENHANCED/{author_id}`,
where `path` is specified in your configuration file, and `author_id`
is stripped of an eventually leading `'9-s2.0-'`.
"""
# Checks
allowed_views = ('METRICS', 'LIGHT', 'STANDARD', 'ENHANCED')
check_parameter_value(view, allowed_views, "view")
# Load json
self._id = str(author_id).split('-')[-1]
self._view = view
self._refresh = refresh
Retrieval.__init__(self, identifier=self._id,
api='AuthorRetrieval', **kwds)
# Parse json
self._json = self._json['author-retrieval-response']
try:
self._json = self._json[0]
except KeyError: # Incomplete forward
alias_json = listify(self._json['alias']['prism:url'])
self._alias = [d['$'].split(':')[-1] for d in alias_json]
alias_str = ', '.join(self._alias)
text = f'Author profile with ID {author_id} has been merged and '\
f'the main profile is now one of {alias_str}. Please update '\
'your records manually. Functionality of this object is '\
'reduced.'
warn(text, UserWarning)
else:
self._alias = None
self._profile = self._json.get("author-profile", {})
def __str__(self):
"""Return a summary string."""
if self._view in ('STANDARD', 'ENHANCED', 'LIGHT'):
date = self.get_cache_file_mdate().split()[0]
main_aff = self.affiliation_current[0]
s = f"{self.indexed_name} from {main_aff.preferred_name} in "\
f"{main_aff.country},\npublished {int(self.document_count):,} "\
f"document(s) since {self.publication_range[0]} "\
f"\nwhich were cited by {int(self.cited_by_count):,} author(s) in "\
f"{int(self.citation_count):,} document(s) as of {date}"
elif self._view == 'METRICS':
s = f'Author with ID {self._id}\n'\
f'published {int(self.document_count):,} document(s)\n'\
f'which were cited by {int(self.cited_by_count):,} author(s) '\
f'in {int(self.citation_count):,} document(s)'
return s
[docs]
def get_coauthors(self) -> Optional[List[NamedTuple]]:
"""Retrieves basic information about co-authors as a list of
namedtuples in the form
`(surname, given_name, id, areas, affiliation_id, name, city, country)`,
where areas is a list of subject area codes joined by `"; "`.
Note: Method retrieves information via individual queries which will
not be cached. The Scopus API returns 160 coauthors at most.
"""
SIZE = 25
# Get number of authors to search for
url = self.coauthor_link
if not url:
return None
res = get_content(url, api="AuthorSearch")
data = loads(res.text)['search-results']
N = int(data.get('opensearch:totalResults', 0))
# Store information in namedtuples
fields = 'surname given_name id areas affiliation_id name city country'
coauth = namedtuple('Coauthor', fields)
coauthors = []
# Iterate over search results in chunks of `SIZE` results
count = SIZE
start = 0
while start < N:
params = {'start': start, 'count': count, 'accept': 'json'}
res = get_content(url, api="AuthorSearch", params=params)
data = loads(res.text)['search-results'].get('entry', [])
# Extract information for each coauthor
for entry in data:
aff = entry.get('affiliation-current', {})
try:
areas = [a['$'] for a in entry.get('subject-area', [])]
except TypeError: # Only one subject area given
areas = [entry['subject-area']['$']]
new = coauth(surname=entry['preferred-name']['surname'],
given_name=entry['preferred-name'].get('given-name'),
id=int(entry['dc:identifier'].split(':')[-1]),
areas='; '.join(areas), name=aff.get('affiliation-name'),
affiliation_id=aff.get('affiliation-id'),
city=aff.get('affiliation-city'),
country=aff.get('affiliation-country'))
coauthors.append(new)
start += SIZE
return coauthors or None
[docs]
def get_documents(self,
subtypes: List[str] = None,
*args: str, **kwds: str
) -> Optional[List[NamedTuple]]:
"""Return list of the author's publications using a `ScopusSearch()`
query, where publications may fit a specified set of document subtypes.
:param subtypes: The type of documents that should be returned.
:param args: Parameters to be passed on to `ScopusSearch()`.
:param kwds: Parameters to be passed on to `ScopusSearch()`.
Note: To update these results, use `refresh`; the class' `refresh`
parameter is not used here.
"""
s = ScopusSearch(f'AU-ID({self.identifier})', **kwds)
if subtypes:
return [p for p in s.results if p.subtype in subtypes]
else:
return s.results
[docs]
def get_document_eids(self,
*args: str, **kwds: str
) -> Optional[List[str]]:
"""Return list of EIDs of the author's publications using
a ScopusSearch() query.
:param args: Parameters to be passed on to `ScopusSearch()`.
:param kwds: Parameters to be passed on to `ScopusSearch()`.
Note: To update these results, use `refresh`; the class' `refresh`
parameter is not used here.
"""
s = ScopusSearch(f'AU-ID({self.identifier})', *args, **kwds)
return s.get_eids()
[docs]
def estimate_uniqueness(self,
query: str = None,
*args: str,
**kwds: str
) -> int:
"""Return the number of Scopus author profiles similar to this profile
via calls with `AuthorSearch()`.
:param query: The query string to perform to search for authors. If
`None`, the query is of form `"AUTHLAST() AND AUTHFIRST()"`
with the corresponding information included. Provided
queries may include `"SUBJAREA()" OR "AF-ID() AND
SUBJAREA()"`. For details see
https://dev.elsevier.com/tips/AuthorSearchTips.htm.
:param args: Parameters to be passed on to `AuthorSearch()`.
:param kwds: Parameters to be passed on to `AuthorSearch()`.
"""
if not query:
query = f"AUTHLAST({self.surname}) AND AUTHFIRST({self.given_name})"
s = AuthorSearch(query, *args, **kwds)
return s.get_results_size()