1 min read
Web Scraping
Cheerio: Schnelles HTML Parsing ohne Browser
Cheerio für effizientes Web Scraping. jQuery-Syntax für Server-Side HTML Parsing ohne Browser-Overhead.
Cheerio · HTML Parsing · Web Scraping · jQuery · Node.js · DOM Manipulation

Cheerio: Schnelles HTML Parsing ohne Browser
Meta-Description: Cheerio für effizientes Web Scraping. jQuery-Syntax für Server-Side HTML Parsing ohne Browser-Overhead.
Keywords: Cheerio, HTML Parsing, Web Scraping, jQuery, Node.js, DOM Manipulation, Data Extraction
Einführung
Cheerio ist eine ultraschnelle HTML Parsing Library für Node.js. Mit jQuery-ähnlicher Syntax parst es HTML serverseitig – ohne Browser, ohne JavaScript-Rendering, nur pure Geschwindigkeit.
Cheerio vs Browser-basierte Tools
┌─────────────────────────────────────────────────────────────┐
│ CHEERIO CHARAKTERISTIKEN │
├─────────────────────────────────────────────────────────────┤
│ │
│ Vorteile: │
│ ├── Extrem schnell (kein Browser-Overhead) │
│ ├── Geringer Memory-Footprint │
│ ├── Vertraute jQuery-Syntax │
│ ├── Serverless-freundlich │
│ └── Ideal für Static HTML │
│ │
│ Einschränkungen: │
│ ├── Kein JavaScript-Rendering │
│ ├── Keine dynamischen Inhalte │
│ ├── Keine Browser-APIs │
│ └── Kein Visual Rendering │
│ │
│ Use Cases: │
│ ├── Static Website Scraping │
│ ├── HTML Email Parsing │
│ ├── RSS/XML Feed Processing │
│ ├── HTML Sanitization │
│ └── Content Extraction │
│ │
└─────────────────────────────────────────────────────────────┘

Setup
npm install cheerio axios

// scraper.ts
import * as cheerio from 'cheerio';
import axios from 'axios';
// Fetch a page and extract its title and meta description.
async function scrape(url: string) {
  // Fetch the raw HTML (axios rejects on non-2xx status codes by default)
  const { data: html } = await axios.get(url, {
    headers: {
      'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
    }
  });

  // Load the markup into Cheerio
  const $ = cheerio.load(html);

  // Query with jQuery-style selectors
  const title = $('h1').text();
  const description = $('meta[name="description"]').attr('content');

  return { title, description };
}

Selektoren & Navigation
import * as cheerio from 'cheerio';

const html = `
<html>
<body>
<div class="container">
<h1>Title</h1>
<p class="intro">Introduction text</p>
<ul id="items">
<li data-id="1">Item 1</li>
<li data-id="2">Item 2</li>
<li data-id="3">Item 3</li>
</ul>
<a href="/page1">Link 1</a>
<a href="/page2" class="external">Link 2</a>
</div>
</body>
</html>
`;

const $ = cheerio.load(html);

// Basic selectors
const title = $('h1').text();                       // "Title"
const intro = $('.intro').text();                   // "Introduction text"
const firstItem = $('#items li').first().text();    // "Item 1"

// Attributes
const href = $('a').attr('href');                   // "/page1" (first match)
const dataId = $('li').first().data('id');          // 1

// Traversal
const items = $('li').toArray().map((el) => $(el).text());
// ["Item 1", "Item 2", "Item 3"]

// Parent/child navigation
const parent = $('li').first().parent().attr('id'); // "items"
const children = $('#items').children().length;     // 3
const siblings = $('li').first().siblings().length; // 2

// Filtering
const externalLinks = $('a.external').map((i, el) => $(el).attr('href')).get();
// Contains
const itemWith2 = $('li:contains("2")').text();     // "Item 2"
// Multiple selectors at once
const headings = $('h1, h2, h3').toArray().map((el) => $(el).text());

Datenextraktion
import * as cheerio from 'cheerio';
import axios from 'axios';
interface Product {
  name: string;
  price: number;
  description: string;
  image: string;
  url: string;
}

/**
 * Parse a European-formatted price string (e.g. "1.234,56 €") into a number.
 * Strips currency symbols/whitespace and thousands separators, then converts
 * the decimal comma to a decimal point. Returns NaN if no digits are present.
 */
function parsePrice(raw: string): number {
  const cleaned = raw
    .replace(/[^\d.,-]/g, '') // drop currency symbols and whitespace
    .replace(/\./g, '')       // drop thousands separators ("1.234" -> "1234")
    .replace(',', '.');       // decimal comma -> decimal point
  return parseFloat(cleaned);
}

/**
 * Scrape all product cards from a listing page.
 * @param url listing page URL to fetch
 * @returns products extracted from '.product-card' elements
 */
async function scrapeProducts(url: string): Promise<Product[]> {
  const { data: html } = await axios.get(url);
  const $ = cheerio.load(html);
  const products: Product[] = [];

  $('.product-card').each((index, element) => {
    const $el = $(element);
    products.push({
      name: $el.find('.product-name').text().trim(),
      // Bug fix: the previous replace('€','').replace(',','.') mis-parsed
      // prices with thousands separators ("1.234,56 €" -> 1.234).
      price: parsePrice($el.find('.price').text()),
      description: $el.find('.description').text().trim(),
      image: $el.find('img').attr('src') || '',
      url: $el.find('a.product-link').attr('href') || ''
    });
  });

  return products;
}
// Extract tables
// Convert a table's <tbody> rows into objects keyed by the <thead> labels.
// Cells without a matching header fall back to the key "colN".
function extractTable($: cheerio.CheerioAPI, selector: string) {
  // Collect the header labels first
  const headers: string[] = $(`${selector} thead th`)
    .toArray()
    .map((th) => $(th).text().trim());

  // Then map each body row to a header -> cell-text record
  const rows: Record<string, string>[] = [];
  $(`${selector} tbody tr`).each((i, row) => {
    const rowData: Record<string, string> = {};
    $(row).find('td').each((j, cell) => {
      rowData[headers[j] || `col${j}`] = $(cell).text().trim();
    });
    rows.push(rowData);
  });

  return rows;
}
// Structured data (JSON-LD)
/**
 * Collect every JSON-LD <script> payload in the document.
 * Malformed JSON blocks are skipped silently (best effort, as before).
 * @returns parsed payloads as `unknown[]` — validate/narrow before use
 *          (previously typed `any[]`, which disabled checking downstream)
 */
function extractJsonLd($: cheerio.CheerioAPI): unknown[] {
  const data: unknown[] = [];
  $('script[type="application/ld+json"]').each((i, el) => {
    const content = $(el).html();
    if (!content) return; // empty script tag — nothing to parse
    try {
      data.push(JSON.parse(content));
    } catch {
      // invalid JSON-LD — ignore and continue with the next block
    }
  });
  return data;
}

HTML Manipulation
import * as cheerio from 'cheerio';
const $ = cheerio.load('<div><p>Hello</p></div>');

// Change text / inner HTML
$('p').text('Hello World');
$('div').html('<span>New Content</span>');

// Insert elements
$('div').append('<p>Appended</p>');
$('div').prepend('<p>Prepended</p>');
$('p').after('<span>After</span>');
$('p').before('<span>Before</span>');

// Attributes & classes
$('div').attr('id', 'container');
$('div').addClass('active');
$('div').removeClass('hidden');

// Remove or empty elements
$('.ads').remove();
$('.tracking').empty();

// Wrap
$('p').wrap('<section></section>');

// Clone and re-insert
const duplicate = $('p').clone();
$('div').append(duplicate);

// Serialize back to HTML
const finalHtml = $.html();
const bodyOnly = $('body').html();

HTML Sanitization
import * as cheerio from 'cheerio';
/**
 * Strip scripting vectors from untrusted HTML and reduce it to a small
 * whitelist of formatting tags.
 * NOTE(review): for production use, prefer a battle-tested sanitizer
 * (DOMPurify, sanitize-html) — this hand-rolled version is illustrative.
 * @param dirtyHtml untrusted HTML input
 * @returns sanitized inner HTML of <body>
 */
function sanitizeHtml(dirtyHtml: string): string {
  const $ = cheerio.load(dirtyHtml);

  // Remove active/embedded content outright. <style>/<object>/<embed> are new
  // here: the whitelist unwrap below would otherwise leak their raw text
  // content back into the output as HTML.
  $('script, iframe, object, embed, style').remove();

  // Drop inline event handlers (onclick, onerror, ...), case-insensitively —
  // HTML attribute names are case-insensitive, so "OnClick" must match too.
  $('*').each((i, el) => {
    const element = $(el);
    const attrs = (el as any).attribs || {};
    Object.keys(attrs).forEach(attr => {
      if (attr.toLowerCase().startsWith('on')) {
        element.removeAttr(attr);
      }
    });
  });

  // Neutralize javascript: URLs. The previous attribute-selector check was
  // case- and whitespace-sensitive and missed e.g. " JavaScript:alert(1)".
  $('a[href]').each((i, el) => {
    const href = $(el).attr('href') || '';
    if (href.trim().toLowerCase().startsWith('javascript:')) {
      $(el).removeAttr('href');
    }
  });
  $('*[style*="expression"]').removeAttr('style');

  // Unwrap anything outside the tag whitelist, keeping its inner HTML.
  // Loop until stable: a single pass can miss nested disallowed tags, because
  // unwrapping a parent detaches the snapshot nodes of its children.
  const allowedTags = ['p', 'a', 'b', 'i', 'u', 'ul', 'ol', 'li', 'br', 'h1', 'h2', 'h3'];
  let changed = true;
  while (changed) {
    changed = false;
    $('*').each((i, el) => {
      const tagName = (el as any).tagName?.toLowerCase();
      if (tagName && !allowedTags.includes(tagName) && tagName !== 'html' && tagName !== 'body') {
        $(el).replaceWith($(el).html() || '');
        changed = true;
      }
    });
  }

  return $('body').html() || '';
}
// Email-safe HTML
/**
 * Prepare HTML for email clients: absolutize root-relative image URLs and
 * strip class attributes (most email clients ignore external stylesheets).
 * @param html    HTML to transform
 * @param baseUrl origin prepended to "/..." image paths
 *                (new parameter; defaults to the previously hard-coded value,
 *                so existing callers are unaffected)
 */
function makeEmailSafe(html: string, baseUrl: string = 'https://example.com'): string {
  const $ = cheerio.load(html);

  // Root-relative image URLs -> absolute
  $('img[src^="/"]').each((i, el) => {
    const src = $(el).attr('src');
    $(el).attr('src', `${baseUrl}${src}`);
  });

  // Real CSS inlining would happen here (simplified); for now just drop
  // class attributes, which are useless without a stylesheet.
  $('*[class]').each((i, el) => {
    $(el).removeAttr('class');
  });

  return $.html();
}

Pagination & Crawling
import * as cheerio from 'cheerio';
import axios from 'axios';
interface CrawlResult {
  url: string;
  title: string;
  links: string[];
}

// Follow "next page" links starting from startUrl, collecting the title and
// outgoing links of each page. Stops after maxPages pages, when a URL repeats
// (cycle guard), or on the first fetch error.
async function crawlWithPagination(startUrl: string, maxPages: number = 10) {
  const visited = new Set<string>();
  const results: CrawlResult[] = [];
  let currentUrl: string | null = startUrl;
  let pageCount = 0;

  while (currentUrl && pageCount < maxPages) {
    if (visited.has(currentUrl)) break; // already crawled — avoid loops
    visited.add(currentUrl);

    try {
      const { data: html } = await axios.get(currentUrl, {
        headers: { 'User-Agent': 'Mozilla/5.0' },
        timeout: 10000
      });
      const $ = cheerio.load(html);

      // All non-anchor links on the page
      const pageLinks = $('a[href]')
        .map((i, el) => $(el).attr('href'))
        .get()
        .filter(href => href && !href.startsWith('#'));

      results.push({
        url: currentUrl,
        title: $('title').text(),
        links: pageLinks
      });

      // Locate the next page via common pagination markup patterns;
      // resolve relative hrefs against the current page URL.
      const nextLink = $('a.next, a[rel="next"], .pagination a:last-child')
        .attr('href');
      currentUrl = nextLink ? new URL(nextLink, currentUrl).toString() : null;
      pageCount++;

      // Be polite: throttle between requests
      await new Promise(r => setTimeout(r, 1000));
    } catch (error) {
      console.error(`Error crawling ${currentUrl}:`, error);
      break;
    }
  }

  return results;
}
// Sitemap Parsing
async function parseSitemap(sitemapUrl: string): Promise<string[]> {
const { data: xml } = await axios.get(sitemapUrl);
const $ = cheerio.load(xml, { xmlMode: true });
const urls: string[] = [];
// Standard Sitemap
$('url loc').each((i, el) => {
urls.push($(el).text());
});
// Sitemap Index
$('sitemap loc').each((i, el) => {
urls.push($(el).text());
});
return urls;
}RSS/Atom Feed Parsing
import * as cheerio from 'cheerio';
import axios from 'axios';
interface FeedItem {
  title: string;
  link: string;
  description: string;
  pubDate: Date;
  author?: string;
}

// Parse RSS 2.0 <item> and Atom <entry> elements into one unified shape.
async function parseRssFeed(feedUrl: string): Promise<FeedItem[]> {
  const { data: xml } = await axios.get(feedUrl);
  const $ = cheerio.load(xml, { xmlMode: true });

  // RSS 2.0 items — <link> carries the URL as text content
  const rssItems: FeedItem[] = $('item').toArray().map((el) => {
    const $item = $(el);
    return {
      title: $item.find('title').text(),
      link: $item.find('link').text(),
      description: $item.find('description').text(),
      pubDate: new Date($item.find('pubDate').text()),
      // dc:creator needs the colon escaped in the selector
      author: $item.find('author, dc\\:creator').text() || undefined
    };
  });

  // Atom entries — here <link> carries the URL in its href attribute
  const atomItems: FeedItem[] = $('entry').toArray().map((el) => {
    const $entry = $(el);
    return {
      title: $entry.find('title').text(),
      link: $entry.find('link').attr('href') || '',
      description: $entry.find('summary, content').text(),
      pubDate: new Date($entry.find('updated, published').text()),
      author: $entry.find('author name').text() || undefined
    };
  });

  return [...rssItems, ...atomItems];
}

Performance-Optimierung
import * as cheerio from 'cheerio';
// Lazy loading for large documents
/**
 * Parse only the relevant section of a large document: load the full page
 * once, then re-load just '#main-content' into its own, much smaller Cheerio
 * instance for further processing.
 * Fixes: the section instance was previously built and then discarded
 * (the function returned nothing); the invalid top-level `decodeEntities`
 * option (not part of cheerio v1's CheerioOptions) was dropped.
 * @returns a Cheerio instance scoped to the section, or null if not found
 */
function processLargeHtml(html: string) {
  const $ = cheerio.load(html, {
    xml: false // explicit: parse as HTML, not XML
  });

  const relevantSection = $('#main-content').html();
  if (!relevantSection) {
    return null;
  }

  // Continue working on the smaller subtree from here on
  return cheerio.load(relevantSection);
}
// Batch processing
/**
 * Scrape page titles in batches of `concurrency` parallel requests.
 * Fixes: Promise.all rejected the whole batch (and the whole run) on a
 * single failed URL — allSettled now logs and skips failures; results are
 * typed instead of `any[]`; the pointless sleep after the final batch is
 * skipped.
 * @param urls        pages to fetch
 * @param concurrency max parallel requests per batch
 */
async function batchScrape(urls: string[], concurrency: number = 5) {
  const results: { url: string; title: string }[] = [];

  for (let i = 0; i < urls.length; i += concurrency) {
    const batch = urls.slice(i, i + concurrency);

    const settled = await Promise.allSettled(
      batch.map(async (url) => {
        const { data } = await axios.get(url);
        const $ = cheerio.load(data);
        return { url, title: $('title').text() };
      })
    );

    for (const outcome of settled) {
      if (outcome.status === 'fulfilled') {
        results.push(outcome.value);
      } else {
        console.error('Scrape failed:', outcome.reason);
      }
    }

    // Rate limiting between batches (none needed after the last one)
    if (i + concurrency < urls.length) {
      await new Promise(r => setTimeout(r, 1000));
    }
  }

  return results;
}

Fazit
Cheerio ist ideal für:
- Static Content: HTML ohne JavaScript-Rendering
- Speed: Kein Browser-Overhead
- Serverless: Geringer Memory-Footprint
- Vertrautheit: jQuery-Syntax
Für dynamische Seiten kombiniere mit Playwright/Puppeteer.
Bildprompts
- "HTML document being parsed into structured data, document processing"
- "jQuery selector finding elements in page structure, DOM navigation"
- "Fast processing of multiple HTML pages, batch extraction"