Tutorials
Real-World Workflows and Use Cases
This section provides step-by-step tutorials for common research workflows using Metapub.
Tutorial 1: Building a Literature Review Dataset
This tutorial shows how to systematically collect and analyze papers for a literature review.
Step 1: Define Your Search Strategy
from metapub import PubMedFetcher
import pandas as pd
from datetime import datetime
fetch = PubMedFetcher()
# Define search parameters
search_terms = [
'machine learning AND genomics',
'artificial intelligence AND genetics',
'deep learning AND biomarker'
]
date_range = {
'since': '2020/01/01',
'until': '2024/12/31'
}
Step 2: Collect PMIDs
all_pmids = set() # Use set to avoid duplicates
for term in search_terms:
print(f"Searching for: {term}")
pmids = fetch.pmids_for_query(
query=term,
retmax=500, # Adjust based on needs
**date_range
)
all_pmids.update(pmids)
print(f"Found {len(pmids)} papers")
print(f"Total unique papers: {len(all_pmids)}")
Step 3: Extract Article Metadata
from metapub.exceptions import InvalidPMID
articles_data = []
for i, pmid in enumerate(all_pmids):
if i % 50 == 0:
print(f"Processed {i}/{len(all_pmids)} articles")
try:
article = fetch.article_by_pmid(pmid)
# Extract key information
data = {
'pmid': pmid,
'title': article.title,
'journal': article.journal,
'year': article.year,
'doi': article.doi,
'authors': '; '.join([str(author) for author in article.authors]),
'abstract': article.abstract,
'mesh_terms': '; '.join(article.mesh_headings) if article.mesh_headings else '',
'publication_types': '; '.join(article.publication_types) if article.publication_types else ''
}
articles_data.append(data)
except InvalidPMID:
print(f"Invalid PMID: {pmid}")
except Exception as e:
print(f"Error processing {pmid}: {e}")
Step 4: Analyze and Export
# Create DataFrame
df = pd.DataFrame(articles_data)
# Basic analysis
print(f"Total articles collected: {len(df)}")
print(f"Year range: {df['year'].min()} - {df['year'].max()}")
print(f"Top 10 journals:")
print(df['journal'].value_counts().head(10))
# Export results
df.to_csv(f'literature_review_{datetime.now().strftime("%Y%m%d")}.csv', index=False)
print("Results exported to CSV")
Tutorial 2: FindIt Batch Processing for Full-Text Access
This tutorial demonstrates how to systematically check full-text availability for a collection of papers.
Step 1: Prepare PMID List
from metapub import FindIt
import csv
import time
# Load PMIDs from various sources
def load_pmids_from_file(filename):
pmids = []
with open(filename, 'r') as f:
for line in f:
pmid = line.strip()
if pmid.isdigit():
pmids.append(pmid)
return pmids
# Or from previous search
pmids = ['25575644', '25700512', '25554792'] # Example PMIDs
Step 2: Batch FindIt Processing
def process_findit_batch(pmids, output_file='findit_results.csv'):
results = []
with open(output_file, 'w', newline='') as csvfile:
fieldnames = ['pmid', 'journal', 'title', 'url_available', 'url', 'reason', 'backup_url', 'embargo_status']
writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
writer.writeheader()
for i, pmid in enumerate(pmids):
print(f"Processing {pmid} ({i+1}/{len(pmids)})")
try:
src = FindIt(pmid, retry_errors=True)
# Check embargo status
embargo_date = src.pma.history.get('pmc-release', None)
embargo_status = 'embargoed' if (
src.reason.startswith("PAYWALL") and "embargo" in src.reason
) else 'not_embargoed'
result = {
'pmid': pmid,
'journal': src.pma.journal,
'title': src.pma.title,
'url_available': bool(src.url),
'url': src.url or '',
'reason': src.reason,
'backup_url': src.backup_url or '',
'embargo_status': embargo_status
}
writer.writerow(result)
results.append(result)
except Exception as e:
print(f"Error processing {pmid}: {e}")
# Rate limiting
time.sleep(0.5)
return results
Step 3: Analyze Access Patterns
def analyze_access_results(results):
df = pd.DataFrame(results)
print("=== Full-Text Access Analysis ===")
print(f"Total articles: {len(df)}")
print(f"URL available: {df['url_available'].sum()} ({df['url_available'].mean()*100:.1f}%)")
print(f"Embargoed articles: {(df['embargo_status'] == 'embargoed').sum()}")
print("\n=== Access by Journal ===")
journal_access = df.groupby('journal')['url_available'].agg(['count', 'sum', 'mean'])
journal_access.columns = ['total', 'available', 'access_rate']
journal_access['access_rate'] = journal_access['access_rate'] * 100
print(journal_access.sort_values('access_rate', ascending=False))
print("\n=== Common Failure Reasons ===")
failed = df[~df['url_available']]
print(failed['reason'].value_counts().head(10))
Tutorial 3: Clinical Genetics Research Workflow
This tutorial shows how to research genetic conditions using MedGen and ClinVar integration.
Step 1: Condition to Gene Discovery
from metapub import MedGenFetcher, ClinVarFetcher, PubMedFetcher
mg = MedGenFetcher()
cv = ClinVarFetcher()
fetch = PubMedFetcher()
def research_condition(condition_name):
print(f"=== Researching: {condition_name} ===")
# Step 1: Find MedGen concepts
concepts = mg.concepts_for_term(condition_name)
if not concepts:
print("No MedGen concepts found")
return
main_concept = concepts[0] # Use primary concept
print(f"Main concept: {main_concept.name} (CUI: {main_concept.cui})")
print(f"Definition: {main_concept.definition}")
return main_concept
Step 2: Find Associated Genes
def find_associated_genes(concept):
# Get related PMIDs from MedGen
pmids = mg.pubmeds_for_cui(concept.cui)
print(f"Found {len(pmids)} related articles")
# Analyze abstracts for gene mentions
gene_mentions = {}
for pmid in pmids[:20]: # Limit for demo
try:
article = fetch.article_by_pmid(pmid)
if article.abstract:
# Simple gene pattern matching (improve as needed)
import re
gene_pattern = r'\b[A-Z][A-Z0-9]{2,}\b' # Basic gene pattern
genes = re.findall(gene_pattern, article.abstract)
for gene in genes:
if gene not in gene_mentions:
gene_mentions[gene] = 0
gene_mentions[gene] += 1
except Exception as e:
continue
# Sort by frequency
top_genes = sorted(gene_mentions.items(), key=lambda x: x[1], reverse=True)
print(f"Top mentioned genes: {top_genes[:10]}")
return top_genes
Step 3: ClinVar Variant Analysis
def analyze_clinvar_variants(gene_list):
for gene, count in gene_list[:5]: # Top 5 genes
print(f"\n=== ClinVar variants for {gene} ===")
try:
# Search for variants in this gene
variants = cv.variants_for_gene(gene)
if variants:
print(f"Found {len(variants)} variants")
# Analyze clinical significance
significance_counts = {}
for variant in variants[:10]: # Limit for demo
sig = variant.clinical_significance
if sig:
significance_counts[sig] = significance_counts.get(sig, 0) + 1
print("Clinical significance distribution:")
for sig, count in significance_counts.items():
print(f" {sig}: {count}")
except Exception as e:
print(f"Error analyzing {gene}: {e}")
Step 4: Generate Research Summary
def generate_research_summary(condition_name):
# Run the full workflow
concept = research_condition(condition_name)
if not concept:
return
genes = find_associated_genes(concept)
analyze_clinvar_variants(genes)
# Generate bibliography
pmids = mg.pubmeds_for_cui(concept.cui)
print(f"\n=== Key References for {condition_name} ===")
for pmid in pmids[:5]: # Top 5 references
try:
article = fetch.article_by_pmid(pmid)
print(f"PMID {pmid}: {article.title}")
print(f" {article.journal} ({article.year})")
print(f" DOI: {article.doi}")
print()
except Exception:
continue
# Run the analysis
generate_research_summary("Brugada syndrome")
Tutorial 4: Journal Analysis and Metrics
This tutorial shows how to analyze publication patterns and journal metrics.
Step 1: Collect Journal Data
def analyze_journal_publication_patterns(journal_name, years_back=5):
from datetime import datetime, timedelta
current_year = datetime.now().year
start_year = current_year - years_back
yearly_data = []
for year in range(start_year, current_year + 1):
print(f"Analyzing {journal_name} for {year}")
pmids = fetch.pmids_for_query(
journal=journal_name,
year=year,
retmax=1000 # Adjust as needed
)
# Sample articles for analysis
sample_size = min(50, len(pmids))
sample_pmids = pmids[:sample_size]
articles = []
for pmid in sample_pmids:
try:
article = fetch.article_by_pmid(pmid)
articles.append(article)
except Exception:
continue
yearly_data.append({
'year': year,
'total_articles': len(pmids),
'analyzed_articles': articles
})
return yearly_data
Step 2: Analyze Publication Trends
def analyze_publication_trends(yearly_data):
import matplotlib.pyplot as plt
years = [data['year'] for data in yearly_data]
counts = [data['total_articles'] for data in yearly_data]
# Publication volume trend
plt.figure(figsize=(10, 6))
plt.plot(years, counts, marker='o')
plt.title('Publication Volume Over Time')
plt.xlabel('Year')
plt.ylabel('Number of Articles')
plt.grid(True)
plt.show()
# Analyze author patterns
all_authors = []
for data in yearly_data:
for article in data['analyzed_articles']:
if article.authors:
all_authors.extend([str(author) for author in article.authors])
from collections import Counter
author_counts = Counter(all_authors)
print("Most prolific authors:")
for author, count in author_counts.most_common(10):
print(f" {author}: {count} papers")
Tutorial 5: Enriching PubMed Results with CrossRef Data
This tutorial demonstrates how to use CrossRef to enrich PubMed articles with additional metadata like citation counts and licensing information.
Step 1: Search PubMed and Collect Articles
from metapub import PubMedFetcher, CrossRefFetcher
fetch = PubMedFetcher()
cr = CrossRefFetcher()
# Search PubMed for your topic
pmids = fetch.pmids_for_query('CRISPR gene therapy', retmax=20)
print(f"Found {len(pmids)} PubMed results")
Step 2: Enrich with CrossRef Metadata
CrossRefFetcher can look up articles by DOI, by title, or directly from a
PubMedArticle object. The article_by_pma method uses title similarity
matching to find the best CrossRef match.
enriched = []
for pmid in pmids:
article = fetch.article_by_pmid(pmid)
# Look up on CrossRef using the PubMedArticle directly
cr_work = cr.article_by_pma(article)
result = {
'pmid': pmid,
'title': article.title,
'journal': article.journal,
'year': article.year,
'doi': article.doi,
}
if cr_work:
result['citation_count'] = cr_work.cited_by_count
result['cr_publisher'] = cr_work.publisher
enriched.append(result)
print(f"PMID {pmid}: {article.title[:60]}...")
Step 3: Look Up a Single Article by DOI or Title
You can also query CrossRef directly when you have a DOI or title:
# By DOI
work = cr.article_by_doi('10.1038/s41586-020-2649-2')
print(f"{work.title} — cited {work.cited_by_count} times")
# By title (returns best match)
work = cr.article_by_title('CRISPR-Cas9 gene editing for sickle cell disease')
if work:
print(f"Found: {work.title}")
print(f"DOI: {work.doi}")
print(f"Publisher: {work.publisher}")