Advanced Usage

This section covers advanced patterns and sophisticated features demonstrated in the demo scripts.

FindIt: Publisher-Specific PDF Access

FindIt provides sophisticated publisher-specific URL resolution for academic papers:

Basic FindIt Usage

from metapub import FindIt

# Look up a single article by PMID and report what FindIt came back with.
src = FindIt('25575644')

if not src.url:
    # No direct link: show the stated reason, plus the backup link if any.
    print(f"No access: {src.reason}")
    if src.backup_url:
        print(f"Backup URL: {src.backup_url}")
else:
    print(f"PDF available: {src.url}")
    print(f"Journal: {src.pma.journal}")

Publisher Registry

FindIt includes a comprehensive, pre-populated journal registry with 68+ publishers (97.1% coverage) that ships with the package. This provides out-of-the-box functionality without requiring setup or database initialization:

# Registry is automatically available - no setup needed
from metapub.findit.registry import JournalRegistry

# Constructed with no arguments, so the database shipped with the package is used.
registry = JournalRegistry()
stats = registry.get_stats()

# Report the two counters exposed by the stats mapping.
publisher_count = stats['publishers']
print(f"Publishers: {publisher_count}")
journal_count = stats['journals']
print(f"Journals: {journal_count}")

Advanced FindIt Options

# Each call below builds a FindIt lookup for the same PMID with exactly one
# optional keyword switched on. `src` is rebound every time, so only the last
# lookup is still referenced afterwards. The one-line labels are this guide's
# summaries of each option.

# With error retry
src = FindIt(pmid='12345678', retry_errors=True)

# NIH access mode
src = FindIt(pmid='12345678', use_nih=True)

# Debug mode for troubleshooting
src = FindIt(pmid='12345678', debug=True)

# Skip verification for speed
src = FindIt(pmid='12345678', verify=False)

Embargo Detection

from metapub import FindIt

src = FindIt('25575644')

# Release date recorded under 'pmc-release' in the article history, or None
# when that key is absent.
embargo_date = src.pma.history.get('pmc-release', None)

# NOTE(review): src.reason appears to be populated only when no URL was found
# and may be None otherwise, so fall back to "" before calling str methods.
reason = src.reason or ""
is_embargoed = reason.startswith("PAYWALL") and "embargo" in reason

if is_embargoed:
    print(f"Article is embargoed until: {embargo_date}")

Publisher Coverage Examples

FindIt handles many publisher-specific patterns:

# Sample PMIDs, grouped by the publisher they are meant to exercise
test_pmids = {
    'Nature': ['16419642', '18830250', '12187393'],
    'BMC': ['25943194', '20170543', '25927199'],
    'ScienceDirect': ['20000000', '25735572', '24565554'],
    'Wiley': ['14981756', '10474162', '10470409'],
    'JAMA': ['25742465', '23754022', '25739104']
}

for publisher, publisher_pmids in test_pmids.items():
    print(f"\n{publisher} results:")
    for pmid in publisher_pmids:
        src = FindIt(pmid)
        # Check mark when a URL was resolved, cross otherwise.
        if src.url:
            status = "✓"
        else:
            status = "✗"
        print(f"  {status} {pmid}: {src.pma.journal}")

Clinical and Medical Genetics Queries

Specialized Search Types

from metapub import PubMedFetcher

fetch = PubMedFetcher()

# Clinical queries with categories.
# Arguments are passed positionally; presumably (search term, category, scope)
# — confirm the parameter names against PubMedFetcher before switching to keywords.
pmids = fetch.pmids_for_clinical_query(
    'Global developmental delay',
    'etiology',
    'broad'  # or 'narrow'
)

# Medical genetics queries.
# NOTE: this rebinds `pmids`, so the clinical-query result above is discarded.
pmids = fetch.pmids_for_medical_genetics_query(
    'Brugada Syndrome',
    'diagnosis'  # or 'genetic_counseling', 'prognosis'
)

Advanced Citation Lookup

# Find article by detailed citation
params = {
    'jtitle': 'Genetics in Medicine',
    'year': 2017,
    'volume': 19,
    'first_page': 1105,
    'aulast': 'Nykamp'
}

pmids = fetch.pmids_for_citation(**params)

# Alternative parameter names for the same kind of lookup
params2 = {
    'journal': 'Nature',
    'year': 2023,
    'volume': 615,
    'spage': 123,  # start page
    'authors': 'Smith; Jones; Brown'
}

# Run the second lookup as well, so the alternative names are actually exercised.
# Stored under a separate name to keep the first result available.
pmids2 = fetch.pmids_for_citation(**params2)

MedGen and ClinVar Integration

Disease-Gene Mapping

from metapub import MedGenFetcher

mg = MedGenFetcher()

# Search MedGen for UIDs matching a disease term
term = "diabetes"
uids = mg.uids_by_term(term)

# Only the first five hits are inspected
for uid in uids[:5]:
    concept = mg.concept_by_uid(uid)
    cui = concept.cui
    print(f"CUI: {cui}")
    print(f"Name: {concept.name}")
    print(f"Definition: {concept.definition}")

    # Articles linked to this concept's CUI
    pmids = mg.pubmeds_for_cui(cui)
    article_count = len(pmids)
    print(f"Related articles: {article_count}")

Gene-Condition Mapping

# List the concepts returned for a gene-scoped MedGen search
gene = "CFTR"
gene_query = f"{gene}[gene]"
uids = mg.uids_by_term(gene_query)

for uid in uids:
    concept = mg.concept_by_uid(uid)
    if not concept.cui:
        # Concepts without a CUI are not reported.
        continue
    print(f"Gene {gene} associated with: {concept.name}")

ClinVar Variant Analysis

from metapub import ClinVarFetcher

cv = ClinVarFetcher()

# Get variant by its ClinVar ID.
# This is the ID you see under "Variation ID" on the ClinVar browser:
#   https://www.ncbi.nlm.nih.gov/clinvar/variation/810732/
# Specifying id_from='entrez' instead allows you to query by Entrez's ID.
variant = cv.variant('810732', id_from='clinvar')

# Summarize the fields exposed on the returned variant object
print(f"Variation name: {variant.variation_name}")
print(f"HGVS notation: {variant.hgvs_c}")
print(f"Clinical significance: {variant.clinical_significance}")
print(f"Molecular consequences: {variant.molecular_consequences}")

CrossRef Integration

DOI Resolution with Fallbacks

from metapub import PubMedFetcher, CrossRefFetcher

fetch = PubMedFetcher()
CR = CrossRefFetcher()


def get_doi_with_fallback(pmid):
    """Return a DOI for *pmid*, consulting CrossRef when PubMed has none.

    The PubMed record is fetched first and its DOI is returned if it is
    truthy. Otherwise the record is handed to CrossRef, and that match's DOI
    is returned only when a match exists and its score is above 80.
    Returns None when neither source yields a DOI.
    """
    article = fetch.article_by_pmid(pmid)
    doi = article.doi
    if doi:
        return doi

    # PubMed had no DOI: ask CrossRef, but only trust a high-scoring match.
    match = CR.article_by_pma(article)
    if not match or not match.score > 80:
        return None
    return match.doi

Batch Processing with CrossRef

import csv
from metapub.exceptions import InvalidPMID

pmids = ['12345678', '23456789', '34567890']

# Article titles routinely contain non-ASCII characters, so the encoding is
# pinned to UTF-8 instead of relying on the platform default.
# newline='' is what the csv module expects for files it writes.
with open('pmid_doi_mapping.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(['PMID', 'DOI', 'Title', 'Status'])

    for pmid in pmids:
        # One row is written per PMID no matter what happens, so a single bad
        # record never aborts the export.
        try:
            pma = fetch.article_by_pmid(pmid)
            doi = get_doi_with_fallback(pmid)
            writer.writerow([pmid, doi or '', pma.title, 'SUCCESS'])
        except InvalidPMID:
            writer.writerow([pmid, '', '', 'INVALID_PMID'])
        except Exception as e:
            # Batch boundary: record the failure in the Status column and move on.
            writer.writerow([pmid, '', '', f'ERROR: {e}'])

Error Handling Patterns

Robust Error Handling

from metapub.exceptions import MetaPubError, InvalidPMID
import logging

# Logger levels for debugging: DEBUG for 'metapub', WARNING for 'requests'
for _logger_name, _logger_level in (
    ('metapub', logging.DEBUG),
    ('requests', logging.WARNING),
):
    logging.getLogger(_logger_name).setLevel(_logger_level)

def safe_article_fetch(pmid):
    """Fetch the article for *pmid*, returning None instead of raising.

    Any exception raised by the fetch is reported with ``print`` and turned
    into a None result. InvalidPMID is tested before MetaPubError, which is
    tested before the generic Exception handler.
    """
    try:
        return fetch.article_by_pmid(pmid)
    except InvalidPMID:
        print(f"Invalid PMID: {pmid}")
    except MetaPubError as err:
        print(f"MetaPub error for {pmid}: {err}")
    except Exception as err:
        print(f"Unexpected error for {pmid}: {err}")
    # Reached only after one of the handlers above has reported a failure.
    return None

Network Error Recovery

import time
from requests.exceptions import RequestException

def fetch_with_retry(pmid, max_retries=3, retry_delay=5):
    """Fetch an article by PMID, retrying when a network error occurs.

    Args:
        pmid: Identifier passed to ``fetch.article_by_pmid``.
        max_retries: Total number of attempts to make; must be at least 1.
        retry_delay: Seconds to sleep between attempts (default 5).

    Returns:
        Whatever ``fetch.article_by_pmid`` returns on the first successful
        attempt.

    Raises:
        ValueError: If ``max_retries`` is less than 1. Without this check the
            loop would never run and the function would silently return None
            without attempting a fetch.
        RequestException: Re-raised from the final attempt once all attempts
            have failed.
    """
    if max_retries < 1:
        raise ValueError(f"max_retries must be at least 1, got {max_retries}")

    for attempt in range(1, max_retries + 1):
        try:
            return fetch.article_by_pmid(pmid)
        except RequestException:
            if attempt == max_retries:
                # Bare raise keeps the original traceback intact.
                raise
            print(f"Network error, retrying in {retry_delay} seconds... ({attempt}/{max_retries})")
            time.sleep(retry_delay)

Performance Optimization

Caching System Overview

Metapub includes a sophisticated caching system designed to minimize API requests and improve performance. The system has evolved to use SQLite-based persistent storage with thread-safe operations.

Key Features:

- Persistent Storage: SQLite database for responses that survive process restarts
- Thread Safety: All cache operations are thread-safe using locks
- NCBI Compliance: Automatic rate limiting respects NCBI guidelines (3 req/sec without API key, 10 req/sec with)
- Response Validation: Only valid XML responses are cached; HTML error pages are rejected
- Legacy Compatibility: Works with existing cache files from previous versions

Cache Configuration

import os
from metapub import PubMedFetcher
from metapub.ncbi_client import NCBIClient

# Option 1: environment variables (traditional)
os.environ.update({
    'METAPUB_CACHE_DIR': '/path/to/large/cache',
    'NCBI_API_KEY': 'your_api_key_here',
})

fetch = PubMedFetcher()

# Option 2: configure an NCBIClient directly (new system)
client_settings = {
    'api_key': 'your_api_key_here',
    'cache_path': '/path/to/cache/ncbi_cache.db',
    'requests_per_second': 10,  # Will be capped to NCBI limits
    'tool': 'my_research_tool',
    'email': 'researcher@university.edu',
}
client = NCBIClient(**client_settings)

Understanding Cache Behavior

from metapub.ncbi_client import SimpleCache

# Direct cache manipulation
cache = SimpleCache('/path/to/cache.db')

# Cache uses URL + parameters as keys
url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
params = {'db': 'pubmed', 'id': '12345678', 'retmode': 'xml'}

# Check if response is cached
cached_response = cache.get(url, params)
if cached_response:
    print("Response found in cache")
else:
    print("Fresh API request needed")

# Manual cache storage (normally done automatically).
# Placeholder payload so the example runs as written; in real use this is the
# XML text returned by the API request for `url` + `params`.
xml_response_string = '<?xml version="1.0"?><PubmedArticleSet></PubmedArticleSet>'
cache.set(url, params, xml_response_string)

Rate Limiting and Performance

from metapub.ncbi_client import RateLimiter
import time

# Demonstrate request spacing at 3 requests per second (the no-API-key rate)
rate_limiter = RateLimiter(requests_per_second=3)

start_time = time.time()
for request_number in range(1, 6):
    rate_limiter.wait_if_needed()
    elapsed = time.time() - start_time
    print(f"Request {request_number} at {elapsed:.2f}s")
    # Your API request here

# Output shows requests spaced by ~0.33 seconds (3 per second)

Cache Database Schema

The cache uses a simple SQLite schema compatible with existing cache files:

-- Single cache table; `key` is the primary key, so there is at most one row per key.
CREATE TABLE cache (
    key BLOB PRIMARY KEY,      -- URL + sorted parameters
    value BLOB,                -- Cached response data
    created INTEGER,           -- Unix timestamp
    value_compressed BOOL DEFAULT 0  -- Legacy compression flag
);

Advanced Cache Management

import sqlite3
import os
from metapub.cache_utils import get_cache_path, cleanup_dir

# Local imports keep this snippet self-contained.
import datetime
from contextlib import closing

# Inspect cache contents
cache_path = get_cache_path()
if cache_path and os.path.exists(cache_path):
    # sqlite3's own context manager only commits or rolls back the transaction;
    # closing() is what actually releases the connection when the block exits.
    with closing(sqlite3.connect(cache_path)) as conn:
        # Count cached entries
        count = conn.execute("SELECT COUNT(*) FROM cache").fetchone()[0]
        print(f"Cache contains {count} entries")

        # Find the oldest entry
        oldest = conn.execute(
            "SELECT created FROM cache ORDER BY created LIMIT 1"
        ).fetchone()
        # NULLs sort first in ascending order, so guard against a missing timestamp.
        if oldest and oldest[0] is not None:
            oldest_date = datetime.datetime.fromtimestamp(oldest[0])
            print(f"Oldest entry: {oldest_date}")

# Clear entire cache directory (the directory containing the cache database file)
if cache_path:
    cache_dir = os.path.dirname(cache_path)
    cleanup_dir(cache_dir)
    print("Cache cleared")

Traditional vs Modern Caching System

Traditional System:

- Dictionary-style access with pickle serialization
- Backward compatible with existing cache files
- Used by PubMedFetcher and other high-level classes

Modern System (NCBIClient):

- URL-based caching with parameter normalization
- JSON serialization for complex objects
- Better thread safety and error handling
- Validation prevents caching of error responses

# Traditional style (still supported)
from metapub import PubMedFetcher
fetch = PubMedFetcher()  # Uses traditional caching

# Modern style (recommended for new code)
from metapub.ncbi_client import NCBIClient
# '/path/to/cache.db' is a placeholder path for the cache database file.
client = NCBIClient(cache_path='/path/to/cache.db')
# Request a single PubMed record by ID through the client.
response = client.efetch(db='pubmed', id='12345678')

Caching Strategies

Batch Processing Optimization

# Process PMIDs in batches
def process_pmids_batch(pmids, batch_size=100):
    """Fetch articles for *pmids* in consecutive slices of *batch_size*.

    Returns a list of ``(pmid, article)`` tuples for every PMID that was
    fetched successfully. A PMID whose fetch raises is reported with
    ``print`` and left out of the result. The function sleeps for one second
    after each slice.
    """
    collected = []

    batch_starts = range(0, len(pmids), batch_size)
    for batch_number, start in enumerate(batch_starts, start=1):
        print(f"Processing batch {batch_number}...")

        for pmid in pmids[start:start + batch_size]:
            try:
                article = fetch.article_by_pmid(pmid)
            except Exception as err:
                print(f"Error with {pmid}: {err}")
            else:
                collected.append((pmid, article))

        # Pause before moving on to the next slice
        time.sleep(1)

    return collected

Preloading and Cache Warming

# Preload FindIt cache for a list of PMIDs
def preload_findit_cache(pmid_file):
    """Run a FindIt lookup for every PMID listed in *pmid_file*.

    The file is read as one PMID per line; surrounding whitespace is stripped
    and blank lines are ignored. Progress is printed every 100 PMIDs, and a
    lookup that raises is reported with ``print`` and skipped. Returns None.
    """
    with open(pmid_file, 'r') as handle:
        stripped_lines = (raw_line.strip() for raw_line in handle)
        pmids = [entry for entry in stripped_lines if entry]

    total = len(pmids)
    print(f"Preloading FindIt cache for {total} PMIDs...")

    for index, pmid in enumerate(pmids):
        if not index % 100:
            print(f"Progress: {index}/{total}")

        try:
            # The lookup is performed only for its caching side effect,
            # so the result is deliberately discarded.
            FindIt(pmid)
        except Exception as err:
            print(f"Error preloading {pmid}: {err}")

URL Reverse Engineering

Extract Identifiers from URLs

from metapub.urlreverse import UrlReverse

# Sample article URLs to pull identifiers out of
urls = [
    'https://doi.org/10.1038/nature12373',
    'https://pubmed.ncbi.nlm.nih.gov/12345678/',
    'https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3458974/'
]

for url in urls:
    lookup = UrlReverse(url)

    # Identifiers exposed on the lookup object
    print(f"URL: {url}")
    print(f"DOI: {lookup.doi}")
    print(f"PMID: {lookup.pmid}")
    print(f"PMC: {lookup.pmcid}")

    # Trace of the steps recorded for this lookup
    print("Steps taken:")
    for step in lookup.steps:
        print(f"  * {step}")
    print()

Troubleshooting and Debugging

Common Issues and Solutions

# Enable detailed logging
import logging
# basicConfig sets up the root logger, so DEBUG output from all libraries is shown.
logging.basicConfig(level=logging.DEBUG)

# Check NCBI service health
# The module's main() is imported under a more descriptive local name.
# NOTE(review): called with no arguments here — confirm main() needs none.
from metapub.ncbi_health_check import main as health_check
health_check()  # Run health check

# Validate PMIDs before processing
import re
# One or more ASCII digits and nothing else. [0-9] is used rather than \d so
# non-ASCII digit characters are rejected, and \Z rather than $ so a trailing
# newline is not tolerated.
pmid_pattern = re.compile(r'^[0-9]+\Z')


def is_valid_pmid(pmid):
    """Return True if *pmid* is made up solely of ASCII digits.

    Args:
        pmid: Candidate PMID; converted with ``str()`` before checking, so
            both ``'12345678'`` and ``12345678`` are accepted.

    Returns:
        bool: True only when the string form is a non-empty run of the digits
        0-9 with no surrounding whitespace or trailing newline.
    """
    return pmid_pattern.fullmatch(str(pmid)) is not None

# Clear cache if having issues
import os
import shutil
from metapub.cache_utils import get_cache_path

# get_cache_path() is treated in this guide as the path of the cache database
# file and may be falsy when no cache is configured, so the directory to remove
# is the file's parent directory, and only when a path is available.
cache_path = get_cache_path()
if cache_path:
    cache_dir = os.path.dirname(cache_path)
    if os.path.isdir(cache_dir):
        shutil.rmtree(cache_dir)
        print(f"Cleared cache directory: {cache_dir}")