Data Extraction

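This example covers patterns for extracting structured data from web pages: plain text, tables, lists, and product information, as well as handling pagination and infinite scroll, reading embedded structured data (JSON-LD, Open Graph, microdata), and capturing API responses made by the page.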

Basic Text Extraction

Extract simple text content:

import { Stepwright } from '@autowright/stepwright';

/** Article metadata collected by the "Text Extraction" script. */
interface PageData {
  title: string;
  description: string;
  author: string;
  publishDate: string;
}

/**
 * Text extraction example: opens one article page, stores its metadata in
 * `ctx.data`, and logs size statistics for the article body.
 */
const script = Stepwright.create<PageData>('Text Extraction')
  .config({
    headless: true,
  })
  .data({
    title: '',
    description: '',
    author: '',
    publishDate: '',
  })

  .step('Navigate to article', async (ctx) => {
    await ctx.page.goto('https://example.com/article/123');
    await ctx.page.waitForSelector('article');
  })

  .step('Extract article metadata', async (ctx) => {
    // Element text is trimmed so markup indentation and line breaks do not
    // leak into the stored values; every field falls back to '' when the
    // lookup yields nothing.
    ctx.data.title = (await ctx.page.locator('h1.article-title').textContent())?.trim() ?? '';
    ctx.data.description = await ctx.page.locator('meta[name="description"]').getAttribute('content') ?? '';
    ctx.data.author = (await ctx.page.locator('.author-name').textContent())?.trim() ?? '';
    ctx.data.publishDate = await ctx.page.locator('time.publish-date').getAttribute('datetime') ?? '';

    ctx.log('Title:', ctx.data.title);
    ctx.log('Author:', ctx.data.author);
    ctx.log('Date:', ctx.data.publishDate);
  })

  .step('Extract article body', async (ctx) => {
    // Get full article HTML
    const articleHtml = await ctx.page.locator('article .content').innerHTML();
    ctx.log('Article length:', articleHtml.length, 'chars');

    // Or get plain text
    const articleText = await ctx.page.locator('article .content').innerText();

    // Trim first and drop empty tokens: ''.split(/\s+/) yields [''], which
    // would otherwise report one word for an empty article, and surrounding
    // whitespace would add phantom words.
    const words = articleText.trim().split(/\s+/).filter(Boolean);
    ctx.log('Word count:', words.length);
  });

const result = await script.run();

if (result.success) {
  console.log('Extracted data:', result.data);
}

Table Data Extraction

Extract data from HTML tables:

import { Stepwright } from '@autowright/stepwright';

/** Column names plus one header-keyed record per table body row. */
interface TableData {
  headers: string[];
  rows: Record<string, string>[];
}

/**
 * Table extraction example: reads the header cells of `table#data`, then
 * turns every body row into an object keyed by those headers.
 */
const script = Stepwright.create<TableData>('Table Extraction')
  .config({
    headless: true,
  })
  .data({
    headers: [],
    rows: [],
  })

  .step('Navigate to table page', async (ctx) => {
    await ctx.page.goto('https://example.com/data-table');
    await ctx.page.waitForSelector('table#data');
  })

  .step('Extract table headers', async (ctx) => {
    ctx.data.headers = await ctx.page.locator('table#data thead th').allTextContents();

    ctx.log('Headers:', ctx.data.headers.join(', '));
  })

  .step('Extract table rows', async (ctx) => {
    const bodyRows = ctx.page.locator('table#data tbody tr');
    const total = await bodyRows.count();

    for (let rowIndex = 0; rowIndex < total; rowIndex += 1) {
      const texts = await bodyRows.nth(rowIndex).locator('td').allTextContents();

      // Pair each header with the cell in the same column; a missing or
      // empty cell is stored as ''.
      const record: Record<string, string> = {};
      for (const [column, header] of ctx.data.headers.entries()) {
        record[header] = texts[column] || '';
      }

      ctx.data.rows.push(record);
    }

    ctx.log('Extracted rows:', ctx.data.rows.length);
  })

  .step('Log extracted data', async (ctx) => {
    // Preview at most the first five rows, then summarise the remainder.
    const preview = ctx.data.rows.slice(0, 5);
    preview.forEach((row) => {
      ctx.log('Row:', JSON.stringify(row));
    });

    const remaining = ctx.data.rows.length - 5;
    if (remaining > 0) {
      ctx.log(`... and ${remaining} more rows`);
    }
  });

const outcome = await script.run();

if (outcome.success) {
  // Export to JSON
  const json = JSON.stringify(outcome.data.rows, null, 2);
  console.log(json);
}

Product List Extraction

Extract e-commerce product data:

import { Stepwright } from '@autowright/stepwright';

/** One product card as extracted from a category listing page. */
interface Product {
  name: string;
  price: number;
  /** Pre-discount price; only set when the card shows a usable one. */
  originalPrice?: number;
  rating: number;
  reviewCount: number;
  imageUrl: string;
  url: string;
  inStock: boolean;
}

/** Category being scraped plus everything extracted from it. */
interface ProductData {
  category: string;
  products: Product[];
  totalProducts: number;
}

/**
 * Parses a displayed amount such as "$1,299.99" into a number.
 *
 * Everything except digits and dots is stripped before parsing. Text with no
 * parsable number (null, empty, "Call for price") yields 0 instead of NaN so
 * later arithmetic and JSON output stay well-defined.
 */
const parseAmount = (text: string | null): number => {
  const value = parseFloat((text ?? '').replace(/[^0-9.]/g, ''));
  return Number.isFinite(value) ? value : 0;
};

/** Discount of `product` in percent; 0 when it has no usable original price. */
const discountPercent = (product: Product): number =>
  product.originalPrice
    ? ((product.originalPrice - product.price) / product.originalPrice) * 100
    : 0;

/**
 * Product extraction example: opens a category page, reads the advertised
 * result count, extracts every product card on the current page, and logs a
 * short summary (best deal, highest rated).
 */
const script = Stepwright.create<ProductData>('Product Extraction')
  .config({
    headless: true,
    defaultTimeout: 30000,
  })
  .data({
    category: 'Electronics',
    products: [],
    totalProducts: 0,
  })

  .step('Navigate to category', async (ctx) => {
    await ctx.page.goto(`https://example.com/category/${ctx.data.category.toLowerCase()}`);
    await ctx.page.waitForSelector('.product-grid');
  })

  .step('Get total count', async (ctx) => {
    const countText = await ctx.page.locator('.results-count').textContent();

    // Accept thousands separators: a bare (\d+) would read "1,234" as 1.
    const match = countText?.match(/\d[\d,]*/);
    ctx.data.totalProducts = match ? parseInt(match[0].replace(/,/g, ''), 10) : 0;

    ctx.log('Total products in category:', ctx.data.totalProducts);
  })

  .step('Extract products from current page', async (ctx) => {
    const productCards = ctx.page.locator('.product-card');
    const count = await productCards.count();

    for (let i = 0; i < count; i++) {
      const card = productCards.nth(i);

      const name = (await card.locator('.product-name').textContent())?.trim() ?? '';

      // Parse price (currency symbols and separators are stripped)
      const price = parseAmount(await card.locator('.current-price').textContent());

      // The original price is only present on discounted cards; an
      // unparsable value is treated the same as "not on sale".
      let originalPrice: number | undefined;
      const originalPriceEl = card.locator('.original-price');
      if ((await originalPriceEl.count()) > 0) {
        originalPrice = parseAmount(await originalPriceEl.textContent()) || undefined;
      }

      // Rating and review count default to 0 when missing or non-numeric
      const ratingText = await card.locator('.rating').getAttribute('data-rating') ?? '';
      const rating = parseFloat(ratingText) || 0;

      const reviewText = await card.locator('.review-count').textContent() ?? '';
      const reviewCount = parseInt(reviewText.replace(/[^0-9]/g, ''), 10) || 0;

      // Get image URL
      const imageUrl = await card.locator('img.product-image').getAttribute('src') ?? '';

      // Get product URL
      const url = await card.locator('a.product-link').getAttribute('href') ?? '';

      // In stock unless the badge contains an "Out of Stock" label
      const stockBadge = card.locator('.stock-badge');
      const inStock = (await stockBadge.locator('text=Out of Stock').count()) === 0;

      ctx.data.products.push({
        name,
        price,
        originalPrice,
        rating,
        reviewCount,
        imageUrl,
        // Site-relative links are made absolute
        url: url.startsWith('/') ? `https://example.com${url}` : url,
        inStock,
      });
    }

    ctx.log('Extracted products from page:', count);
  })

  .step('Summary', async (ctx) => {
    const { products } = ctx.data;
    ctx.log('Total extracted:', products.length);

    // Nothing to summarise; reading the top entry of an empty list would throw.
    if (products.length === 0) {
      return;
    }

    // Find best deal (biggest discount). reduce() is used instead of sort()
    // so the extracted list keeps its page order; the first of tied
    // candidates wins.
    const onSale = products.filter((p) => p.originalPrice);
    if (onSale.length > 0) {
      const bestDeal = onSale.reduce((best, p) =>
        discountPercent(p) > discountPercent(best) ? p : best
      );
      ctx.log(`Best deal: ${bestDeal.name} - ${discountPercent(bestDeal).toFixed(0)}% off`);
    }

    // Highest rated, again without reordering ctx.data.products
    const highestRated = products.reduce((best, p) => (p.rating > best.rating ? p : best));
    ctx.log(`Highest rated: ${highestRated.name} - ${highestRated.rating} stars`);
  });

Pagination Handling

Extract data across multiple pages:

import { Stepwright } from '@autowright/stepwright';

/** A single listing entry. */
interface Item {
  id: string;
  title: string;
}

/** Items collected so far plus the pagination cursor. */
interface PaginatedData {
  items: Item[];
  currentPage: number;
  totalPages: number;
}

/**
 * Pagination example: reads the page total from the pagination label, then
 * walks the listing page by page, collecting every item along the way.
 */
const script = Stepwright.create<PaginatedData>('Paginated Extraction')
  .config({
    headless: true,
  })
  .data({
    items: [],
    currentPage: 1,
    totalPages: 0,
  })

  .step('Navigate to listing', async (ctx) => {
    await ctx.page.goto('https://example.com/listings');
    await ctx.page.waitForSelector('.listing-item');

    // The label is matched as "Page X of Y"; assume one page if it does not match.
    const label = await ctx.page.locator('.pagination-info').textContent();
    const pageTotal = label?.match(/Page \d+ of (\d+)/)?.[1];
    ctx.data.totalPages = pageTotal ? parseInt(pageTotal) : 1;

    ctx.log('Total pages:', ctx.data.totalPages);
  })

  .step('Extract all pages', async (ctx) => {
    while (ctx.data.currentPage <= ctx.data.totalPages) {
      ctx.log(`Processing page ${ctx.data.currentPage}/${ctx.data.totalPages}`);

      // Collect every item on the page currently shown
      const entries = ctx.page.locator('.listing-item');
      const entryCount = await entries.count();

      for (let index = 0; index < entryCount; index += 1) {
        const entry = entries.nth(index);
        ctx.data.items.push({
          id: await entry.getAttribute('data-id') || '',
          title: await entry.locator('.item-title').textContent() || '',
        });
      }

      ctx.log(`Extracted ${entryCount} items from page ${ctx.data.currentPage}`);

      // Stop when there is no enabled "next" control or this was the last page
      const nextControl = ctx.page.locator('.pagination-next:not([disabled])');
      const canAdvance =
        (await nextControl.count()) > 0 && ctx.data.currentPage < ctx.data.totalPages;
      if (!canAdvance) {
        break;
      }

      // Start waiting for the listing response before clicking so the
      // response triggered by the click cannot be missed.
      const listingResponse = ctx.page.waitForResponse(
        (res) => res.url().includes('/listings') && res.status() === 200
      );

      await nextControl.click();
      await listingResponse;

      ctx.data.currentPage += 1;

      // Wait for new content
      await ctx.page.waitForSelector('.listing-item');
    }

    ctx.log('Total items extracted:', ctx.data.items.length);
  });

Infinite Scroll Extraction

Handle infinite scroll pages:

import { Stepwright } from '@autowright/stepwright';

/** Feed items collected so far and the scroll budget. */
interface ScrollData {
  items: string[];
  scrollCount: number;
  /** Upper bound on scroll operations. */
  maxScrolls: number;
}

/**
 * Infinite scroll example: repeatedly scrolls the feed to the bottom and
 * collects the text of every `.feed-item` not seen before. It stops when the
 * scroll budget is used up, when three consecutive passes find nothing new,
 * or when the page height stops changing.
 */
const script = Stepwright.create<ScrollData>('Infinite Scroll Extraction')
  .config({
    headless: true,
  })
  .data({
    items: [],
    scrollCount: 0,
    maxScrolls: 10, // Limit scrolls
  })

  .step('Navigate to feed', async (ctx) => {
    await ctx.page.goto('https://example.com/feed');
    await ctx.page.waitForSelector('.feed-item');
  })

  .step('Scroll and extract', async (ctx) => {
    // Set-based lookup keeps de-duplication linear instead of rescanning
    // ctx.data.items for every item on every pass.
    const seen = new Set(ctx.data.items);

    /** Appends feed item texts not collected yet and returns how many were added. */
    const collectNewItems = async (): Promise<number> => {
      const texts = await ctx.page.locator('.feed-item').allTextContents();
      let added = 0;
      for (const text of texts) {
        if (!seen.has(text)) {
          seen.add(text);
          ctx.data.items.push(text);
          added += 1;
        }
      }
      return added;
    };

    let previousHeight = 0;
    let noNewContent = 0;

    while (ctx.data.scrollCount < ctx.data.maxScrolls) {
      // Get current height
      const currentHeight = await ctx.page.evaluate(() => document.body.scrollHeight);

      // Extract visible items (only new ones)
      const added = await collectNewItems();

      if (added > 0) {
        ctx.log(`Found ${added} new items`);
        noNewContent = 0;
      } else {
        noNewContent++;
        if (noNewContent >= 3) {
          ctx.log('No new content after 3 scrolls, stopping');
          break;
        }
      }

      // Scroll to bottom
      await ctx.page.evaluate(() => {
        window.scrollTo(0, document.body.scrollHeight);
      });

      // Wait for potential new content
      await ctx.page.waitForTimeout(1000);

      // Stop once the height matches both the pre-scroll height and the
      // height recorded on the previous pass.
      const newHeight = await ctx.page.evaluate(() => document.body.scrollHeight);
      if (newHeight === previousHeight && newHeight === currentHeight) {
        ctx.log('Reached end of content');
        break;
      }

      previousHeight = currentHeight;
      ctx.data.scrollCount++;
    }

    // Items are harvested before each scroll, so whatever the last scroll
    // loaded has not been read yet; without this final pass it would be
    // dropped when the loop ends on the scroll budget.
    const trailing = await collectNewItems();
    if (trailing > 0) {
      ctx.log(`Found ${trailing} new items`);
    }

    ctx.log('Total items extracted:', ctx.data.items.length);
    ctx.log('Scroll operations:', ctx.data.scrollCount);
  });

Structured Data Extraction (JSON-LD)

Extract structured data embedded in the page (JSON-LD, Open Graph meta tags, and microdata):

import { Stepwright } from '@autowright/stepwright';

/** Structured data found on a page, grouped by embedding format. */
interface StructuredData {
  /** Top-level JSON-LD objects; top-level arrays are flattened. */
  jsonLd: Record<string, unknown>[];
  /** Open Graph `property` -> `content` pairs. */
  openGraph: Record<string, string>;
  /** One flat record per `[itemscope]` element. */
  microdata: Record<string, string>[];
}

/** True when `value` is a plain (non-null, non-array) object. */
const isRecord = (value: unknown): value is Record<string, unknown> =>
  typeof value === 'object' && value !== null && !Array.isArray(value);

/** True when the node's `@type` equals `type` or is an array containing it. */
const hasType = (node: Record<string, unknown>, type: string): boolean => {
  const declared = node['@type'];
  return Array.isArray(declared) ? declared.includes(type) : declared === type;
};

/**
 * Structured data example: collects JSON-LD blocks, Open Graph meta tags,
 * and microdata from a single product page.
 */
const script = Stepwright.create<StructuredData>('Structured Data Extraction')
  .config({
    headless: true,
  })
  .data({
    jsonLd: [],
    openGraph: {},
    microdata: [],
  })

  .step('Navigate to page', async (ctx) => {
    await ctx.page.goto('https://example.com/product/123');
  })

  .step('Extract JSON-LD', async (ctx) => {
    const jsonLdScripts = ctx.page.locator('script[type="application/ld+json"]');
    const count = await jsonLdScripts.count();

    for (let i = 0; i < count; i++) {
      const content = await jsonLdScripts.nth(i).textContent();
      if (content) {
        try {
          // A block may hold one object or an array of objects. Store each
          // object separately and skip anything that is not an object, so
          // the stored data matches the declared element type.
          const parsed: unknown = JSON.parse(content);
          const nodes: unknown[] = Array.isArray(parsed) ? parsed : [parsed];
          ctx.data.jsonLd.push(...nodes.filter(isRecord));
        } catch (e) {
          ctx.log('Failed to parse JSON-LD:', e);
        }
      }
    }

    ctx.log('JSON-LD objects found:', ctx.data.jsonLd.length);

    // Log product info if present. Nodes nested under "@graph" are searched
    // as well, and "@type" may be a single string or an array.
    const candidates = ctx.data.jsonLd.flatMap((node) => {
      const graph = node['@graph'];
      return Array.isArray(graph) ? graph.filter(isRecord) : [node];
    });
    const productData = candidates.find((node) => hasType(node, 'Product'));
    if (productData) {
      // "offers" may be a single offer or a list; use the first in that case.
      const offers = productData.offers;
      const offer = Array.isArray(offers) ? offers[0] : offers;

      ctx.log('Product name:', productData.name);
      ctx.log('Product price:', isRecord(offer) ? offer.price : undefined);
    }
  })

  .step('Extract Open Graph meta tags', async (ctx) => {
    const ogTags = ctx.page.locator('meta[property^="og:"]');
    const count = await ogTags.count();

    for (let i = 0; i < count; i++) {
      const tag = ogTags.nth(i);
      const property = await tag.getAttribute('property');
      const content = await tag.getAttribute('content');

      // Tags missing either attribute (or with an empty value) are skipped
      if (property && content) {
        ctx.data.openGraph[property] = content;
      }
    }

    ctx.log('Open Graph tags:', Object.keys(ctx.data.openGraph).length);
    ctx.log('OG Title:', ctx.data.openGraph['og:title']);
  })

  .step('Extract microdata', async (ctx) => {
    // Build one flat string-to-string record per [itemscope] element so the
    // result really is Record<string, string>[] and needs no type assertion.
    // "@type" holds the itemtype; every other key is an itemprop name. When
    // an itemprop name repeats within a scope, the last value wins.
    ctx.data.microdata = await ctx.page.evaluate(() =>
      Array.from(document.querySelectorAll('[itemscope]')).map((scope) => {
        const record: Record<string, string> = {
          '@type': scope.getAttribute('itemtype') || '',
        };

        for (const prop of Array.from(scope.querySelectorAll('[itemprop]'))) {
          const name = prop.getAttribute('itemprop');
          if (!name) {
            continue;
          }
          // Prefer visible text; fall back to the content attribute, then ''
          record[name] = prop.textContent?.trim() || prop.getAttribute('content') || '';
        }

        return record;
      })
    );

    ctx.log('Microdata items:', ctx.data.microdata.length);
  });

API Response Extraction

Extract data from API calls made by the page:

import { Stepwright } from '@autowright/stepwright';

/** JSON payloads captured from the page's API traffic, with their URLs. */
interface ApiData {
  apiResponses: Array<{
    url: string;
    data: unknown;
  }>;
}

/**
 * API extraction example: registers a response listener, loads the
 * dashboard, clicks "load more", and logs every JSON payload captured from
 * URLs containing "/api/".
 */
const script = Stepwright.create<ApiData>('API Extraction')
  .config({
    headless: true,
  })
  .data({
    apiResponses: [],
  })

  .step('Setup API interception', async (ctx) => {
    // The listener is registered before any navigation happens
    ctx.page.on('response', async (res) => {
      const requestUrl = res.url();

      // Ignore everything that is not an API call
      if (!requestUrl.includes('/api/')) {
        return;
      }

      try {
        const payload = await res.json();
        ctx.data.apiResponses.push({ url: requestUrl, data: payload });
        ctx.log('Captured API:', requestUrl);
      } catch {
        // Not JSON response, skip
      }
    });

    ctx.log('API interception setup');
  })

  .step('Navigate and trigger API calls', async (ctx) => {
    await ctx.page.goto('https://example.com/dashboard');
    await ctx.page.waitForLoadState('networkidle');

    ctx.log('Page loaded, API calls captured:', ctx.data.apiResponses.length);
  })

  .step('Trigger more API calls', async (ctx) => {
    // Click to load more data
    await ctx.page.click('#load-more');
    await ctx.page.waitForLoadState('networkidle');

    ctx.log('Total API calls captured:', ctx.data.apiResponses.length);

    // Log each captured call with the first 100 characters of its payload
    ctx.data.apiResponses.forEach((captured) => {
      ctx.log('API:', captured.url);
      ctx.log('Data:', `${JSON.stringify(captured.data).slice(0, 100)}...`);
    });
  });

Running the Examples

# Basic extraction: run the script file directly with tsx
npx tsx scripts/text-extraction.ts

# Save extracted data to file: everything the script writes to stdout is
# redirected into products.json
npx tsx scripts/product-extraction.ts > products.json

# Run with logging: go through the stepwright CLI with the --verbose flag
npx stepwright run scripts/table-extraction.ts --verbose

Key Takeaways

Use specific selectors to target data elements
Handle missing data gracefully with defaults
Parse numeric values carefully (remove currency symbols, etc.)
Intercept API calls when possible for cleaner data
Implement pagination for complete data extraction

Next Steps

Multi-Page Workflow - Complex data collection flows
Handling Dynamic Content - AJAX-loaded data
Best Practices - Data extraction tips