Menu
Back to Blog
1 min read
Automatisierung

Playwright Web Scraping: Stealth & Performance Guide

Playwright für Web Scraping 2026. Stealth-Techniken, Anti-Bot Detection, Browser Fingerprinting und Performance-Optimierung.

Playwright · Web Scraping · Stealth · Browser Automation · Anti-Bot · Headless Browser
Playwright Web Scraping: Stealth & Performance Guide

Playwright Web Scraping: Stealth & Performance Guide

Meta-Description: Playwright für Web Scraping 2026. Stealth-Techniken, Anti-Bot Detection, Browser Fingerprinting und Performance-Optimierung.

Keywords: Playwright, Web Scraping, Stealth, Browser Automation, Anti-Bot, Headless Browser, Data Extraction


Einführung

Playwright ist das moderne Standard-Tool für Web Scraping. Mit WebSocket-basierter Kommunikation, Native Network Interception und Multi-Browser-Support bietet es alles für skalierbare Datenextraktion.


Playwright vs Alternativen

┌─────────────────────────────────────────────────────────────┐
│              SCRAPING TOOLS 2026                            │
├─────────────────────────────────────────────────────────────┤
│                                                             │
│  PLAYWRIGHT                                                 │
│  ├── WebSocket-First (schneller als HTTP)                  │
│  ├── Multi-Browser (Chrome, Firefox, WebKit)               │
│  ├── Native Request Interception                           │
│  ├── Auto-Waiting                                          │
│  └── Best for: Modern JS Sites, Stealth                    │
│                                                             │
│  PUPPETEER                                                  │
│  ├── Chrome DevTools Protocol                              │
│  ├── Chrome/Firefox Support                                │
│  ├── Größere Community                                     │
│  └── Best for: Chrome-specific Features                    │
│                                                             │
│  CHEERIO                                                    │
│  ├── Kein Browser (nur HTML Parsing)                       │
│  ├── Extrem schnell                                        │
│  ├── Niedrige Ressourcen                                   │
│  └── Best for: Static Sites                                │
│                                                             │
└─────────────────────────────────────────────────────────────┘

Basic Setup

npm install playwright
npx playwright install  # Browser installieren
// scraper.ts
import { chromium, Browser, Page } from 'playwright';

// Opens a headless Chromium with a German desktop profile, loads the
// target page, and returns its title plus all h1/h2 heading texts.
async function scrape() {
  const browser = await chromium.launch({
    headless: true  // headless is the default; stated explicitly for clarity
  });

  const ctx = await browser.newContext({
    userAgent: 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
    viewport: { width: 1920, height: 1080 },
    locale: 'de-DE',
    timezoneId: 'Europe/Berlin'
  });

  const tab = await ctx.newPage();

  try {
    await tab.goto('https://example.com', {
      waitUntil: 'networkidle',
      timeout: 30000
    });

    // Pull the interesting bits straight out of the live DOM.
    return await tab.evaluate(() => ({
      title: document.title,
      headings: Array.from(document.querySelectorAll('h1, h2'))
        .map(h => h.textContent)
    }));
  } finally {
    await browser.close();  // always release the browser process
  }
}

Stealth Mode

// playwright-extra für Stealth Plugins
import { chromium } from 'playwright-extra';
import stealth from 'puppeteer-extra-plugin-stealth';

chromium.use(stealth());

// Launches a stealth-patched Chromium session and navigates to the given
// URL. Returns the live page/browser/context so the caller can continue
// scraping; the caller is responsible for closing the browser.
async function stealthScrape(url: string) {
  const browser = await chromium.launch({ headless: true });

  // A believable desktop profile: randomized UA/viewport plus a fixed
  // locale/timezone/geolocation (Berlin) so the values stay mutually
  // consistent — mismatched values are a common detection signal.
  const ctx = await browser.newContext({
    userAgent: getRandomUserAgent(),
    viewport: getRandomViewport(),
    locale: 'de-DE',
    timezoneId: 'Europe/Berlin',
    geolocation: { latitude: 52.52, longitude: 13.405 },
    permissions: ['geolocation'],
    deviceScaleFactor: 1,
    hasTouch: false,
    isMobile: false
  });

  // Patch the page environment before any site script runs.
  await ctx.addInitScript(() => {
    // Hide the automation flag.
    Object.defineProperty(navigator, 'webdriver', {
      get: () => undefined
    });

    // Headless Chrome reports no plugins; fake a non-empty list.
    Object.defineProperty(navigator, 'plugins', {
      get: () => [1, 2, 3, 4, 5]
    });

    // Keep the reported languages in line with the context locale.
    Object.defineProperty(navigator, 'languages', {
      get: () => ['de-DE', 'de', 'en-US', 'en']
    });

    // Remove ChromeDriver marker globals some detectors probe for.
    delete (window as any).cdc_adoQpoasnfa76pfcZLmcfl_Array;
    delete (window as any).cdc_adoQpoasnfa76pfcZLmcfl_Promise;
    delete (window as any).cdc_adoQpoasnfa76pfcZLmcfl_Symbol;
  });

  const page = await ctx.newPage();
  await page.goto(url);

  return { page, browser, context: ctx };
}

// Random User Agents
// Picks one of three realistic desktop user-agent strings at random,
// so repeated sessions do not all present the identical UA fingerprint.
function getRandomUserAgent(): string {
  const pool = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:121.0) Gecko/20100101 Firefox/121.0'
  ];
  const pick = Math.floor(Math.random() * pool.length);
  return pool[pick];
}

// Returns one of four common desktop viewport sizes at random so window
// dimensions vary between sessions like they would across real machines.
function getRandomViewport() {
  const sizes = [
    { width: 1920, height: 1080 },
    { width: 1366, height: 768 },
    { width: 1536, height: 864 },
    { width: 1440, height: 900 }
  ];
  const pick = Math.floor(Math.random() * sizes.length);
  return sizes[pick];
}

Network Interception

// Ressourcen blockieren für Speed
/**
 * Opens `url` with non-essential resources (images, CSS, fonts, media) and
 * common tracking requests blocked, which significantly speeds up loads.
 *
 * Returns the live Page; retrieve the browser via `page.context().browser()`
 * when it is time to close it.
 */
async function fastScrape(url: string) {
  const browser = await chromium.launch();
  const context = await browser.newContext();
  const page = await context.newPage();

  await page.route('**/*', (route) => {
    const resourceType = route.request().resourceType();

    // Heavy assets are irrelevant for data extraction — drop them.
    if (['image', 'stylesheet', 'font', 'media'].includes(resourceType)) {
      return route.abort();
    }

    // Block analytics/tracking/ad requests as well.
    // Fix: renamed from `url` so it no longer shadows the function parameter.
    const requestUrl = route.request().url();
    if (
      requestUrl.includes('analytics') ||
      requestUrl.includes('tracking') ||
      requestUrl.includes('ads')
    ) {
      return route.abort();
    }

    return route.continue();
  });

  await page.goto(url, { waitUntil: 'domcontentloaded' });
  return page;
}

// API Responses abfangen
// Registers a response listener that collects the JSON body of every
// successful response whose URL contains '/api/'. Returns the (initially
// empty) array, which fills up as responses arrive during navigation.
async function interceptApi(page: Page) {
  const captured: any[] = [];

  page.on('response', async (response) => {
    const responseUrl = response.url();
    if (!responseUrl.includes('/api/') || !response.ok()) return;

    try {
      captured.push({ url: responseUrl, data: await response.json() });
    } catch {
      // Body was not JSON — skip it.
    }
  });

  return captured;
}

// Modify outgoing requests: inject extra headers into every '/api/' call.
// NOTE(review): top-level `await` and the `page` reference assume a
// surrounding module context — this snippet is not self-contained.
await page.route('**/api/**', (route) => {
  // Merge the original request headers with the additions below.
  const headers = {
    ...route.request().headers(),
    'Authorization': 'Bearer token123',
    'X-Custom-Header': 'value'
  };

  route.continue({ headers });
});

Selektoren & Datenextraktion

// Moderne Selektoren
// Demonstrates the main extraction techniques: single/multiple elements,
// role/label/text locators, attribute harvesting, table rows, and JSON-LD.
async function extractData(page: Page) {
  // Single element via CSS selector.
  const title = await page.locator('h1').textContent();

  // Semantic locators (role / label / visible text).
  const loginButton = page.getByRole('button', { name: 'Login' });
  const emailInput = page.getByLabel('E-Mail');
  const link = page.getByText('Mehr erfahren');

  // All matching texts in one call.
  const prices = await page.locator('.price').allTextContents();

  // Collect href + trimmed text for every anchor on the page.
  const links = await page.locator('a').evaluateAll((elements) =>
    elements.map(el => ({
      href: el.getAttribute('href'),
      text: el.textContent?.trim()
    }))
  );

  // Table body → array of cell-text arrays, one per row.
  const tableData = await page.evaluate(() =>
    Array.from(document.querySelectorAll('table tbody tr')).map(row =>
      Array.from(row.querySelectorAll('td')).map(cell => cell.textContent?.trim())
    )
  );

  // Structured data embedded as JSON-LD, if the page provides it.
  const jsonLd = await page.evaluate(() => {
    const script = document.querySelector('script[type="application/ld+json"]');
    return script ? JSON.parse(script.textContent || '{}') : null;
  });

  return { title, prices, links, tableData, jsonLd };
}

// Warten auf dynamischen Content
// Four ways to wait for dynamically rendered content before extracting it.
async function waitForContent(page: Page) {
  // 1. Wait until a specific element becomes visible.
  await page.waitForSelector('.product-list', { state: 'visible' });

  // 2. Wait until network traffic has gone quiet.
  await page.waitForLoadState('networkidle');

  // 3. Wait for the first matching element to appear.
  await page.locator('.product-card').first().waitFor();

  // 4. Wait for an arbitrary predicate evaluated inside the page.
  await page.waitForFunction(
    () => document.querySelectorAll('.product-card').length >= 10
  );
}

Pagination & Infinite Scroll

// Pagination
/**
 * Scrapes `?page=1..maxPages` of a paginated listing and collects all items.
 * Stops early when a page yields no items. The browser is always closed,
 * even if navigation or extraction throws.
 */
async function scrapePaginated(baseUrl: string, maxPages: number = 10) {
  const browser = await chromium.launch();

  try {
    const page = await browser.newPage();
    const allData: any[] = [];

    for (let i = 1; i <= maxPages; i++) {
      await page.goto(`${baseUrl}?page=${i}`);

      const pageData = await page.evaluate(() => {
        return Array.from(document.querySelectorAll('.item')).map(el => ({
          title: el.querySelector('.title')?.textContent,
          price: el.querySelector('.price')?.textContent
        }));
      });

      if (pageData.length === 0) break;  // no more results

      allData.push(...pageData);

      // Randomized 1-3s delay as basic rate limiting.
      await page.waitForTimeout(1000 + Math.random() * 2000);
    }

    return allData;
  } finally {
    await browser.close();  // fix: previously leaked the browser on errors
  }
}

// Infinite Scroll
/**
 * Loads a page that uses infinite scroll and keeps scrolling to the bottom
 * until either the page height stops growing (no new content) or
 * `maxScrolls` is reached, then extracts the loaded items.
 */
async function scrapeInfiniteScroll(url: string, maxScrolls: number = 20) {
  const browser = await chromium.launch();

  try {
    const page = await browser.newPage();
    await page.goto(url);

    let previousHeight = 0;
    let scrollCount = 0;

    while (scrollCount < maxScrolls) {
      // Scroll to the bottom to trigger the next batch of content.
      await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));

      // Give lazy-loaded content time to arrive.
      await page.waitForTimeout(2000);

      const currentHeight = await page.evaluate(() => document.body.scrollHeight);

      if (currentHeight === previousHeight) {
        break;  // height unchanged → nothing new was appended
      }

      previousHeight = currentHeight;
      scrollCount++;
    }

    // Fix: the original had an empty `.map(/* ... */)` placeholder, which
    // does not compile. Extract each item's trimmed text instead.
    const data = await page.evaluate(() => {
      return Array.from(document.querySelectorAll('.item'))
        .map(el => el.textContent?.trim());
    });

    return data;
  } finally {
    await browser.close();  // fix: previously leaked the browser on errors
  }
}

Proxy & Session Management

// Proxy setup: route all browser traffic through an authenticated HTTP proxy.
// NOTE(review): top-level `await` assumes an ES-module context — this
// snippet is not self-contained.
const browser = await chromium.launch({
  proxy: {
    server: 'http://proxy.example.com:8080',  // proxy endpoint (host:port)
    username: 'user',
    password: 'pass'
  }
});

// Rotating Proxies
// Visits each URL through a randomly chosen proxy, spinning up a fresh
// browser per URL so every request leaves via a different exit node.
async function withRotatingProxy(urls: string[]) {
  const proxyPool = [
    'http://proxy1.example.com:8080',
    'http://proxy2.example.com:8080',
    'http://proxy3.example.com:8080'
  ];

  for (const target of urls) {
    const pick = Math.floor(Math.random() * proxyPool.length);

    const browser = await chromium.launch({
      proxy: { server: proxyPool[pick] }
    });

    try {
      const page = await browser.newPage();
      await page.goto(target);
      // Scrape...
    } finally {
      await browser.close();
    }
  }
}

// Session/Cookie Persistence
// Demonstrates cookie-based session persistence: log in once, dump the
// cookies to disk, then seed a fresh context with them later.
// NOTE(review): relies on `browser` and `fs` being in scope at the call
// site — neither is defined in this snippet; confirm against the caller.
async function persistSession() {
  // --- Save the session ---
  const loginContext = await browser.newContext();
  const loginPage = await loginContext.newPage();

  await loginPage.goto('https://example.com/login');
  // Perform the login here...

  // Persist all cookies for later reuse.
  const cookies = await loginContext.cookies();
  await fs.writeFile('cookies.json', JSON.stringify(cookies));

  // --- Restore the session ---
  const restoredContext = await browser.newContext();
  const savedCookies = JSON.parse(await fs.readFile('cookies.json', 'utf8'));
  await restoredContext.addCookies(savedCookies);
}

Parallel Scraping

import { chromium, Browser } from 'playwright';
import pLimit from 'p-limit';

async function parallelScrape(urls: string[], concurrency: number = 5) {
  const browser = await chromium.launch();
  const limit = pLimit(concurrency);

  const results = await Promise.all(
    urls.map(url =>
      limit(async () => {
        const context = await browser.newContext();
        const page = await context.newPage();

        try {
          await page.goto(url, { timeout: 30000 });
          const data = await extractData(page);
          return { url, data, success: true };
        } catch (error) {
          return { url, error: (error as Error).message, success: false };
        } finally {
          await context.close();
        }
      })
    )
  );

  await browser.close();
  return results;
}

Fazit

Playwright Web Scraping 2026:

  1. Stealth First: Anti-Detection von Anfang an
  2. Performance: Resource Blocking, Parallel Scraping
  3. Robustheit: Auto-Waiting, Retries, Error Handling
  4. Compliance: Robots.txt respektieren, Rate Limiting

Immer rechtliche Aspekte und Terms of Service beachten.


Bildprompts

  1. "Spider crawling through web pages extracting data, web scraping concept"
  2. "Browser automation with invisible robot, stealth scraping"
  3. "Multiple parallel processes scraping different websites, concurrent extraction"

Quellen