Pulling bibliometric data from PubMed via the NCBI E-utilities API

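Create an account on https://www.ncbi.nlm.nih.gov/myncbi/. Once registered, click on your email address to navigate to your account settings, then scroll down to API Key Management and create an API key; this raises your E-utilities limit to 10 requests/second. Save the API key in a config.ini file next to this notebook, under an [API] section with the option name key (i.e. a line "[API]" followed by a line "key = <your key>").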
Load libraries and configuration file with API key
import json
import pandas as pd
import requests
import pandas as pd
from xml.etree import ElementTree
from urllib.parse import urlencode
import configparser
import warnings
# Silence all warnings for the rest of the session.
warnings.filterwarnings('ignore')

# Load the NCBI API key from config.ini ([API] section, option "key").
config = configparser.ConfigParser()
# ConfigParser.read() silently skips files it cannot open and returns the list
# of files it actually parsed, so an empty result means config.ini is missing.
if not config.read('config.ini'):
    raise FileNotFoundError(
        "config.ini not found or unreadable; create it with an [API] section "
        "containing 'key = <your NCBI API key>'"
    )
api_key = config.get('API', 'key')
Create a Dataset to query the API
# Search topics: a short label for each condition plus the PubMed query text
# that will be sent for it.
data = [
    ["OCD", "Obsessive-Compulsive Disorder treatment"],
    ["GAD", "generalized anxiety disorder treatment"],
    ["panic disorder", "panic disorder treatment"],
]
column_names = ["Name", "Query"]
df = pd.DataFrame.from_records(data, columns=column_names)
print(df)
Name Query
0 OCD Obsessive-Compulsive Disorder treatment
1 GAD generalized anxiety disorder treatment
2 panic disorder panic disorder treatment
# The API key was already read from config.ini above, so this self-assignment
# is a no-op placeholder. To hard-code a key instead, assign the string here.
api_key = api_key
Define a function that returns the PubMed IDs (PMIDs) matching a query. Results are limited to 100 PMIDs per query.
def query_pubmed(search_term, api_key):
    """Search PubMed and return the PMIDs that match ``search_term``.

    Sends a single ESearch request to NCBI E-utilities asking for at most
    100 results sorted by publication date.

    Args:
        search_term: PubMed query string.
        api_key: NCBI API key sent with the request.

    Returns:
        A list of PMID strings. An empty list is returned (after printing an
        error) when the request fails, the server answers with a non-200
        status, or the response is not well-formed XML.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    query_params = {
        "db": "pubmed",
        "term": search_term,
        "retmax": 100,  # at most 100 PMIDs per query
        # requests percent-encodes a literal "+", so "pub+date" would reach
        # the server as "pub%2Bdate"; "pub_date" is used as the documented
        # PubMed sort value (NOTE(review): confirm against the NCBI ESearch docs).
        "sort": "pub_date",
        "api_key": api_key,
    }

    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(base_url, params=query_params, timeout=30)
    except requests.RequestException as exc:
        print(f"Error: {exc}")
        return []

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return []

    try:
        root = ElementTree.fromstring(response.content)
    except ElementTree.ParseError as exc:
        print(f"Error: {exc}")
        return []

    return [id_elem.text for id_elem in root.findall(".//Id")]
Define a function that fetches the paper details for a given PMID.
For more information on what is available to pull: https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html#versionid
def _element_text(element):
    """Return all text inside ``element``, including nested markup, or ""."""
    if element is None:
        return ""
    # itertext() also yields text inside child tags (e.g. <i>, <sub>), which
    # a plain ``.text`` would cut off at the first child element.
    return "".join(element.itertext()).strip()


def _publication_date(article):
    """Build a "Year-Month-Day" string from the first PubDate, skipping absent parts."""
    pub_date = article.find(".//PubDate")
    if pub_date is None:
        return ""
    parts = [pub_date.findtext(tag, default="") for tag in ("Year", "Month", "Day")]
    present = [part for part in parts if part]
    if present:
        return "-".join(present)
    # Some records carry a free-text MedlineDate instead of Year/Month/Day.
    return pub_date.findtext("MedlineDate", default="")


def _clinical_trial_ids(article):
    """Return the ClinicalTrials.gov accession numbers joined with "; "."""
    accession_numbers = []
    for databank in article.findall(".//DataBankList//DataBank"):
        if databank.findtext(".//DataBankName") != "ClinicalTrials.gov":
            continue
        for accession in databank.findall(".//AccessionNumber"):
            number = _element_text(accession)
            if number:
                accession_numbers.append(number)
    return "; ".join(accession_numbers)


def _authors_and_affiliations(article):
    """Return (author names joined with "; ", flat list of affiliation strings)."""
    names = []
    affiliations = []
    for author in article.findall(".//Author"):
        first_name = author.findtext(".//ForeName", default="")
        last_name = author.findtext(".//LastName", default="")
        full_name = " ".join(part for part in (first_name, last_name) if part)
        if not full_name:
            # Group authors have a CollectiveName instead of personal names.
            full_name = author.findtext(".//CollectiveName", default="")
        if full_name:
            names.append(full_name)
        for affiliation in author.findall(".//Affiliation"):
            affiliations.append(_element_text(affiliation))
    return "; ".join(names), affiliations


def _parse_article(article, pubmed_id):
    """Convert one PubmedArticle element into the flat result dictionary."""
    # A structured abstract is split into several AbstractText sections.
    abstract_sections = [
        _element_text(section)
        for section in article.findall(".//Abstract/AbstractText")
    ]
    abstract = " ".join(section for section in abstract_sections if section)

    coauthors_str, coauthors_affiliations = _authors_and_affiliations(article)

    journal_element = article.find(".//Journal")
    if journal_element is not None:
        issn = journal_element.findtext(".//ISSN", default="")
        volume = journal_element.findtext(".//Volume", default="")
        issue = journal_element.findtext(".//Issue", default="")
    else:
        issn = volume = issue = ""

    keyword_list_element = article.find(".//KeywordList")
    if keyword_list_element is not None:
        keywords = [
            _element_text(keyword)
            for keyword in keyword_list_element.findall(".//Keyword")
        ]
    else:
        keywords = []

    return {
        "Title": _element_text(article.find(".//ArticleTitle")),
        "Abstract": abstract,
        "Date of Publication": _publication_date(article),
        "Authors": coauthors_str,
        "Coauthors Affiliations": coauthors_affiliations,
        "Conflict of Interest Statement": _element_text(article.find(".//CoiStatement")),
        "ISSN": issn,
        "Volume": volume,
        "Issue": issue,
        "Keywords": keywords,
        "PubMed ID": pubmed_id,
        "PubMed URL": f"https://pubmed.ncbi.nlm.nih.gov/{pubmed_id}/",
        "Clinical Trials ID": _clinical_trial_ids(article),
    }


def get_paper_details(pubmed_id, api_key):
    """Fetch one PubMed record via EFetch and return its details as a dict.

    Args:
        pubmed_id: PMID of the article to fetch.
        api_key: NCBI API key sent with the request.

    Returns:
        A dictionary with the keys "Title", "Abstract", "Date of Publication",
        "Authors", "Coauthors Affiliations", "Conflict of Interest Statement",
        "ISSN", "Volume", "Issue", "Keywords", "PubMed ID", "PubMed URL" and
        "Clinical Trials ID". Fields absent from the record are empty strings
        or empty lists. "Date of Publication" contains only the date parts
        that are present (e.g. "2023", "2024-Jun", "2024-Apr-30").

        An empty dictionary is returned (after printing a message) when the
        request fails, the status is not 200, the response is not well-formed
        XML, or it contains no PubmedArticle element.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    query_params = {
        "db": "pubmed",
        "id": pubmed_id,
        "rettype": "abstract",
        "retmode": "xml",
        "api_key": api_key,
    }

    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(base_url, params=query_params, timeout=30)
    except requests.RequestException as exc:
        print(f"Error: {exc}")
        return {}

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        return {}

    try:
        root = ElementTree.fromstring(response.content)
    except ElementTree.ParseError as exc:
        print(f"Error: {exc}")
        return {}

    article = root.find(".//PubmedArticle")
    if article is None:
        print("Article not found for PubMed ID:", pubmed_id)
        return {}

    return _parse_article(article, pubmed_id)
Loop through the queries
# One DataFrame per query. Queries that yield no papers are left out so that
# pd.concat is never handed an empty list (which raises ValueError).
dfs = []

for query in df['Query']:
    pubmed_ids = query_pubmed(query, api_key)

    # Gather plain dicts and build the DataFrame once per query instead of
    # creating and concatenating a one-row DataFrame per paper.
    paper_rows = []
    for pubmed_id in pubmed_ids:
        paper_details = get_paper_details(pubmed_id, api_key)
        if not paper_details:
            # get_paper_details returns {} (and prints why) when the fetch
            # fails; skip it rather than emit a row holding only the query.
            continue
        # Record which query produced this paper
        paper_details["Query"] = query
        paper_rows.append(paper_details)

    if paper_rows:
        query_df = pd.DataFrame(paper_rows)
        dfs.append(query_df)

# Combine all queries; fall back to an empty frame when nothing was retrieved
results_df = pd.concat(dfs, ignore_index=True) if dfs else pd.DataFrame()

# Save the results to a CSV file
results_df.to_csv("pubmed_results.csv", index=False)
results_df.head()
Title | Abstract | Date of Publication | Authors | Coauthors Affiliations | Conflict of Interest Statement | ISSN | Volume | Issue | Keywords | PubMed ID | PubMed URL | Clinical Trials ID | Query | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Insight in obsessive-compulsive disorder: conc... | Obsessive-compulsive disorder (OCD) is a chron... | 2023-- | Yueqi Huang; Yazhu Weng; Lan Lan; Cheng Zhu; T... | [Department of Psychiatry, Affiliated Mental H... | 2634-4416 | 3 | [insight, mental health, neuroimaging, obsessi... | 38666121 | https://pubmed.ncbi.nlm.nih.gov/38666121/ | Obsessive-Compulsive Disorder treatment | |||
1 | The use of coaching in smartphone app-based co... | Body dysmorphic disorder (BDD) is severe and u... | 2024-Jun- | Emily E Bernstein; Jennifer L Greenberg; Hilar... | [Massachusetts General Hospital, United States... | The authors declare the following financial in... | 2214-7829 | 36 | [Body dysmorphic disorder, Clinical trial, Coa... | 38660465 | https://pubmed.ncbi.nlm.nih.gov/38660465/ | Obsessive-Compulsive Disorder treatment | ||
2 | Modeling the volume of tissue activated in dee... | Deep brain stimulation (DBS) is a neuromodulat... | 2024-- | Erin E Patrick; Chance R Fleeting; Drashti R P... | [Department of Electrical and Computer Enginee... | The authors declare that the research was cond... | 1662-5161 | 18 | [DBS, VTA, connectivity maps, deep brain stimu... | 38660012 | https://pubmed.ncbi.nlm.nih.gov/38660012/ | Obsessive-Compulsive Disorder treatment | ||
3 | Quantitative analysis of noninvasive deep temp... | Deep brain stimulation (DBS) is a method for s... | 2024-Apr-30 | Zohre Mojiri; Amir Akhavan; Ehsan Rouhani; Say... | [Department of Electrical and Computer Enginee... | The authors declare that they have no known co... | 2405-8440 | 10 | 8 | [Hodgkin-Huxley model, Noninvasive deep brain ... | 38655334 | https://pubmed.ncbi.nlm.nih.gov/38655334/ | Obsessive-Compulsive Disorder treatment | |
4 | Outcomes of Anorexia Nervosa in a Male Patient... | Eating disorders (EDs) are among the most dang... | 2024-Apr- | Mohammed Alkhamis; Waad D Alotaibi; Ghaiah J A... | [Psychiatry, King Fahad Hospital, Al-Hofuf, SA... | The authors have declared that no competing in... | 2168-8184 | 16 | 4 | [anorexia nervosa, eating behaviors, eating be... | 38654963 | https://pubmed.ncbi.nlm.nih.gov/38654963/ | Obsessive-Compulsive Disorder treatment |
# Write the combined results to the shared data folder as well. The folder is
# created first because to_csv does not create missing parent directories.
import os

os.makedirs("../Data", exist_ok=True)
results_df.to_csv("../Data/results.csv", index=False)