Pulling Bibliometric Data from PubMed via API

Create an account at https://www.ncbi.nlm.nih.gov/myncbi/. Once registered, click your email address to open account settings, then scroll down to API Key Management. Create an API key to raise your E-utilities limit from 3 to 10 requests per second. Save the API key to a config.ini file.
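The configparser call below reads a section named API with a single key entry, so a minimal config.ini would look like this (the value shown is a placeholder for your actual key):

[API]
key = your_api_key_here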

Load libraries and the configuration file containing the API key

import pandas as pd
import requests
from xml.etree import ElementTree
import configparser
import warnings
warnings.filterwarnings('ignore')  # Suppress noisy warnings in notebook output

config = configparser.ConfigParser()
config.read('config.ini')
api_key = config.get('API', 'key')

Create a dataset of queries for the API

data = [
    ["OCD", "Obsessive-Compulsive Disorder treatment"],
    ["GAD", "generalized anxiety disorder treatment"],
    ["panic disorder", "panic disorder treatment"],
]
df = pd.DataFrame(data, columns=["Name", "Query"])
print(df)
             Name                                    Query
0             OCD  Obsessive-Compulsive Disorder treatment
1             GAD   generalized anxiety disorder treatment
2  panic disorder                 panic disorder treatment

Define a function that returns PMIDs for a search query, limiting results to 100 PMIDs per query.

def query_pubmed(search_term, api_key):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    query_params = {
        "db": "pubmed",
        "term": search_term,
        "retmax": 100,  # Increase the number of maximum results to return to 100
        "sort": "pub+date",  # Sort by publication date (most recent first)
        "api_key": api_key  # Include your API key here
    }
    response = requests.get(base_url, params=query_params)
    if response.status_code == 200:
        root = ElementTree.fromstring(response.content)
        pubmed_ids = [id_elem.text for id_elem in root.findall(".//Id")]
        return pubmed_ids
    else:
        print(f"Error: {response.status_code}")
        return []
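As a quick sanity check, the function can be called directly with one of the queries from the dataset above; the exact PMIDs returned will vary as PubMed indexes new papers:

pmids = query_pubmed("Obsessive-Compulsive Disorder treatment", api_key)
print(f"{len(pmids)} PMIDs returned; first three: {pmids[:3]}")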

Define a function to get paper details for each PMID

For more information on the fields available to pull, see: https://www.nlm.nih.gov/bsd/licensee/elements_descriptions.html#versionid

def get_paper_details(pubmed_id, api_key):
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    query_params = {
        "db": "pubmed",
        "id": pubmed_id,
        "rettype": "abstract",
        "retmode": "xml",
        "api_key": api_key
    }
    response = requests.get(base_url, params=query_params)
    if response.status_code == 200:
        root = ElementTree.fromstring(response.content)
        
        # Check if the article exists
        article = root.find(".//PubmedArticle")
        if article is not None:
            # Extracting information from the XML response
            title = article.findtext(".//ArticleTitle", default="")
            abstract_element = article.find(".//AbstractText")
            abstract = abstract_element.text if abstract_element is not None else ""
            # Build the publication date from whichever parts are present
            pub_date = article.find(".//PubDate")
            date_parts = [pub_date.findtext("Year", default=""),
                          pub_date.findtext("Month", default=""),
                          pub_date.findtext("Day", default="")] if pub_date is not None else []
            date_of_publication = "-".join(part for part in date_parts if part)
            
            # Extracting Clinical Trials ID
            clinical_trials_id = ""
            databank_list = article.find(".//DataBankList")
            if databank_list is not None:
                for databank in databank_list.findall(".//DataBank"):
                    databank_name = databank.find(".//DataBankName")
                    if databank_name is not None and databank_name.text == "ClinicalTrials.gov":
                        accession_number = databank.find(".//AccessionNumber")
                        if accession_number is not None:
                            clinical_trials_id = accession_number.text
            
            # Extracting authors and their affiliations
            authors_info = []
            for author in article.findall(".//Author"):
                last_name = author.findtext(".//LastName", default="")
                first_name = author.findtext(".//ForeName", default="")
                full_name = f"{first_name} {last_name}"
                
                # Extracting author's affiliations
                affiliations = []
                for affiliation in author.findall(".//Affiliation"):
                    affiliations.append(affiliation.text)
                
                authors_info.append({
                    "Author": full_name,
                    "Affiliations": affiliations
                })
            
            # Duplicate data for co-authors
            coauthors_str = "; ".join([author["Author"] for author in authors_info])
            coauthors_affiliations = [affiliation for author in authors_info for affiliation in author["Affiliations"]]
            
            # Extracting Conflict of Interest Statement
            conflict_of_interest_element = article.find(".//CoiStatement")
            conflict_of_interest = conflict_of_interest_element.text if conflict_of_interest_element is not None else ""
            
            # Extracting Journal information
            journal_element = article.find(".//Journal")
            issn = journal_element.findtext(".//ISSN", default="")
            volume = journal_element.findtext(".//Volume", default="")
            issue = journal_element.findtext(".//Issue", default="")
            
            # Extracting Keywords
            keyword_list_element = article.find(".//KeywordList")
            if keyword_list_element is not None:
                keywords = [keyword.text for keyword in keyword_list_element.findall(".//Keyword")]
            else:
                keywords = []
            
            # Generate PubMed URL
            pubmed_url = f"https://pubmed.ncbi.nlm.nih.gov/{pubmed_id}/"
            
            return {
                "Title": title,
                "Abstract": abstract,
                "Date of Publication": date_of_publication,
                "Authors": coauthors_str,
                "Coauthors Affiliations": coauthors_affiliations,
                "Conflict of Interest Statement": conflict_of_interest,
                "ISSN": issn,
                "Volume": volume,
                "Issue": issue,
                "Keywords": keywords,
                "PubMed ID": pubmed_id,
                "PubMed URL": pubmed_url,  # Add PubMed URL to the returned dictionary
                "Clinical Trials ID": clinical_trials_id  # Add Clinical Trials ID to the returned dictionary
            }
        else:
            print("Article not found for PubMed ID:", pubmed_id)
            return {}
    else:
        print(f"Error: {response.status_code}")
        return {}
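One practical note before looping: even with an API key, E-utilities allows at most 10 requests per second, and the loop below issues one efetch call per PMID (up to 300 in this example). If you start seeing rate-limit errors, a throttled wrapper like this minimal sketch (an addition, not part of the original pipeline) can be swapped in for get_paper_details:

import time

def get_paper_details_throttled(pubmed_id, api_key, delay=0.11):
    # Sleeping ~0.11 s per call keeps keyed requests just under 10/second
    time.sleep(delay)
    return get_paper_details(pubmed_id, api_key)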

Loop through the queries


# Initialize an empty list to store the DataFrames for each query
dfs = []

# Iterate over each row and perform PubMed query
for index, row in df.iterrows():
    query = row['Query']
    pubmed_ids = query_pubmed(query, api_key)
    
    # Initialize an empty list to store the DataFrames for each PubMed ID
    paper_dfs = []
    
    # Iterate over each PubMed ID and retrieve paper details
    for pubmed_id in pubmed_ids:
        paper_details = get_paper_details(pubmed_id, api_key)
        
        # Add the query to the paper details
        paper_details["Query"] = query
        
        # Convert paper details to DataFrame and append to list
        paper_df = pd.DataFrame([paper_details])
        paper_dfs.append(paper_df)
    
    # Concatenate DataFrames for each PubMed ID and append to list
    if paper_dfs:  # Skip queries that returned no papers
        query_df = pd.concat(paper_dfs, ignore_index=True)
        dfs.append(query_df)

# Concatenate DataFrames for each query
results_df = pd.concat(dfs, ignore_index=True)

# Save the results to a CSV file
results_df.to_csv("pubmed_results.csv", index=False)
results_df.head()

results_df.head() returns the first five rows, all from the "Obsessive-Compulsive Disorder treatment" query. Columns: Title, Abstract, Date of Publication, Authors, Coauthors Affiliations, Conflict of Interest Statement, ISSN, Volume, Issue, Keywords, PubMed ID, PubMed URL, Clinical Trials ID, Query. Truncated view:

0  Insight in obsessive-compulsive disorder: conc...   2023         ISSN 2634-4416  Vol. 3          PMID 38666121
1  The use of coaching in smartphone app-based co...   2024-Jun     ISSN 2214-7829  Vol. 36         PMID 38660465
2  Modeling the volume of tissue activated in dee...   2024         ISSN 1662-5161  Vol. 18         PMID 38660012
3  Quantitative analysis of noninvasive deep temp...   2024-Apr-30  ISSN 2405-8440  Vol. 10, No. 8  PMID 38655334
4  Outcomes of Anorexia Nervosa in a Male Patient...   2024-Apr     ISSN 2168-8184  Vol. 16, No. 4  PMID 38654963
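Note that the Keywords and Coauthors Affiliations columns hold Python lists, which to_csv serializes as their string representations. A minimal sketch for recovering them when reading the CSV back (assuming the pubmed_results.csv written above):

import ast
import pandas as pd

df_back = pd.read_csv("pubmed_results.csv")

def parse_list_cell(value):
    # Cells were written as str(list); missing cells come back as NaN
    return ast.literal_eval(value) if isinstance(value, str) and value.startswith("[") else []

for col in ["Keywords", "Coauthors Affiliations"]:
    df_back[col] = df_back[col].apply(parse_list_cell)

From here, results_df is ready for bibliometric analysis, such as counting authors, affiliations, or keywords per query.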