Content Selection
Crawl4AI provides multiple ways to select and filter specific content from webpages. Learn how to precisely target the content you need.
CSS Selectors
The simplest way to extract specific content:
# Extract specific content using a CSS selector
result = await crawler.arun(
    url="https://example.com",
    css_selector=".main-article"  # Target the main article content
)

# Multiple selectors
result = await crawler.arun(
    url="https://example.com",
    css_selector="article h1, article .content"  # Target heading and content
)
Content Filtering
Control what content is included or excluded:
result = await crawler.arun(
    url="https://example.com",

    # Content thresholds
    word_count_threshold=10,  # Minimum words per block

    # Tag exclusions
    excluded_tags=['form', 'header', 'footer', 'nav'],

    # Link filtering
    exclude_external_links=True,      # Remove external links
    exclude_social_media_links=True,  # Remove social media links

    # Media filtering
    exclude_external_images=True  # Remove external images
)
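To check what the filters removed, you can compare the raw and cleaned output on the returned result (a quick sketch using CrawlResult attributes):

# Compare raw vs. filtered output
print(len(result.html))          # Original page HTML
print(len(result.cleaned_html))  # HTML after tag/link/media filtering
print(result.markdown[:300])     # Preview of the markdown built from the filtered content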
Iframe Content
Process content inside iframes:
result = await crawler.arun(
    url="https://example.com",
    process_iframes=True,         # Extract iframe content
    remove_overlay_elements=True  # Remove popups/modals that might block iframes
)
Structured Content Selection
Using LLMs for Smart Selection
Use LLMs to intelligently extract specific types of content:
import json
from typing import List

from pydantic import BaseModel
from crawl4ai.extraction_strategy import LLMExtractionStrategy

class ArticleContent(BaseModel):
    title: str
    main_points: List[str]
    conclusion: str

strategy = LLMExtractionStrategy(
    provider="ollama/nemotron",  # Works with any supported LLM
    schema=ArticleContent.schema(),
    instruction="Extract the main article title, key points, and conclusion"
)

result = await crawler.arun(
    url="https://example.com",
    extraction_strategy=strategy
)

article = json.loads(result.extracted_content)
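The parsed value follows the ArticleContent schema; depending on the model and page, the strategy may return a list of such objects rather than a single one, so a defensive sketch for reading it:

items = article if isinstance(article, list) else [article]
for item in items:
    print(item["title"])
    for point in item["main_points"]:
        print(f"- {point}")
    print(item["conclusion"])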
Pattern-Based Selection
For repeated content patterns (like product listings, news feeds):
import json
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

schema = {
    "name": "News Articles",
    "baseSelector": "article.news-item",  # Repeated element
    "fields": [
        {"name": "headline", "selector": "h2", "type": "text"},
        {"name": "summary", "selector": ".summary", "type": "text"},
        {"name": "category", "selector": ".category", "type": "text"},
        {
            "name": "metadata",
            "type": "nested",
            "fields": [
                {"name": "author", "selector": ".author", "type": "text"},
                {"name": "date", "selector": ".date", "type": "text"}
            ]
        }
    ]
}

strategy = JsonCssExtractionStrategy(schema)

result = await crawler.arun(
    url="https://example.com",
    extraction_strategy=strategy
)

articles = json.loads(result.extracted_content)
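Here extracted_content is a JSON array with one object per matched article.news-item, keyed by the field names in the schema (including the nested metadata group):

for item in articles:
    print(item["headline"], "|", item["category"])
    print(f"  by {item['metadata']['author']} on {item['metadata']['date']}")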
Domain-Based Filtering
Control content based on domains:
result = await crawler.arun(
    url="https://example.com",
    exclude_domains=["ads.com", "tracker.com"],
    exclude_social_media_domains=["facebook.com", "twitter.com"],  # Custom social media domains to exclude
    exclude_social_media_links=True
)
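To verify the filtering, the links that survived are available on the result, grouped into internal and external (a sketch; the key and field names follow Crawl4AI's result.links layout):

for link in result.links["internal"]:
    print(link["href"], link.get("text", ""))

# With the exclusions above, the external group should be empty or heavily trimmed
print(len(result.links["external"]))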
Media Selection
Select specific types of media:
result = await crawler.arun(url="https://example.com")

# Access different media types
images = result.media["images"]  # List of image details
videos = result.media["videos"]  # List of video details
audios = result.media["audios"]  # List of audio details

# Each image entry includes metadata
for image in images:
    print(f"URL: {image['src']}")
    print(f"Alt text: {image['alt']}")
    print(f"Description: {image['desc']}")
    print(f"Relevance score: {image['score']}")
Comprehensive Example
Here's how to combine different selection methods:
import json
from typing import List

from pydantic import BaseModel

from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy

async def extract_article_content(url: str):
    # Define structured extraction
    article_schema = {
        "name": "Article",
        "baseSelector": "article.main",
        "fields": [
            {"name": "title", "selector": "h1", "type": "text"},
            {"name": "content", "selector": ".content", "type": "text"}
        ]
    }

    # Define LLM extraction
    class ArticleAnalysis(BaseModel):
        key_points: List[str]
        sentiment: str
        category: str

    async with AsyncWebCrawler() as crawler:
        # Get structured content
        pattern_result = await crawler.arun(
            url=url,
            extraction_strategy=JsonCssExtractionStrategy(article_schema),
            word_count_threshold=10,
            excluded_tags=['nav', 'footer'],
            exclude_external_links=True
        )

        # Get semantic analysis
        analysis_result = await crawler.arun(
            url=url,
            extraction_strategy=LLMExtractionStrategy(
                provider="ollama/nemotron",
                schema=ArticleAnalysis.schema(),
                instruction="Analyze the article content"
            )
        )

        # Combine results
        return {
            "article": json.loads(pattern_result.extracted_content),
            "analysis": json.loads(analysis_result.extracted_content),
            "media": pattern_result.media
        }
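To run the combined pipeline (the URL is a placeholder):

import asyncio

data = asyncio.run(extract_article_content("https://example.com"))
print(data["analysis"])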