Skip to content

Data Extraction

Data Extraction

This example covers patterns for extracting structured data from web pages, including tables, lists, product information, and handling pagination.

Basic Text Extraction

Extract simple text content:

scripts/text-extraction.ts
import { Stepwright } from '@korvol/stepwright';
/** Article metadata gathered by the extraction steps below. */
interface PageData {
  /** Text of the `h1.article-title` element. */
  title: string;
  /** `content` attribute of the description `<meta>` tag. */
  description: string;
  /** Text of the `.author-name` element. */
  author: string;
  /** `datetime` attribute of the `time.publish-date` element. */
  publishDate: string;
}

// Three steps: open the article, read its metadata, then measure its body.
const script = Stepwright.create<PageData>('Text Extraction')
  .config({
    headless: true,
  })
  .data({
    title: '',
    description: '',
    author: '',
    publishDate: '',
  })
  .step('Navigate to article', async (ctx) => {
    await ctx.page.goto('https://example.com/article/123');
    await ctx.page.waitForSelector('article');
  })
  .step('Extract article metadata', async (ctx) => {
    // A null result (e.g. a missing attribute) falls back to an empty string
    // so every PageData field is always a string.
    ctx.data.title = (await ctx.page.locator('h1.article-title').textContent()) ?? '';
    ctx.data.description =
      (await ctx.page.locator('meta[name="description"]').getAttribute('content')) ?? '';
    ctx.data.author = (await ctx.page.locator('.author-name').textContent()) ?? '';
    ctx.data.publishDate =
      (await ctx.page.locator('time.publish-date').getAttribute('datetime')) ?? '';

    ctx.log('Title:', ctx.data.title);
    ctx.log('Author:', ctx.data.author);
    ctx.log('Date:', ctx.data.publishDate);
  })
  .step('Extract article body', async (ctx) => {
    const body = ctx.page.locator('article .content');

    // Get full article HTML
    const articleHtml = await body.innerHTML();
    ctx.log('Article length:', articleHtml.length, 'chars');

    // Or get plain text
    const articleText = await body.innerText();
    // Count runs of non-whitespace instead of splitting on whitespace:
    // ''.split(/\s+/) yields [''] (a bogus count of 1), and leading/trailing
    // whitespace would add empty tokens.
    const words = articleText.match(/\S+/g) ?? [];
    ctx.log('Word count:', words.length);
  });

const result = await script.run();
if (result.success) {
  console.log('Extracted data:', result.data);
}

Table Data Extraction

Extract data from HTML tables:

scripts/table-extraction.ts
import { Stepwright } from '@korvol/stepwright';
/** Contents of the `table#data` element, keyed by column header. */
interface TableData {
  /** Header cell texts, in column order. */
  headers: string[];
  /** One object per body row, mapping header text to cell text. */
  rows: Record<string, string>[];
}

// Reads the header row first, then converts every body row into a keyed object.
const script = Stepwright.create<TableData>('Table Extraction')
  .config({
    headless: true,
  })
  .data({
    headers: [],
    rows: [],
  })
  .step('Navigate to table page', async (ctx) => {
    await ctx.page.goto('https://example.com/data-table');
    await ctx.page.waitForSelector('table#data');
  })
  .step('Extract table headers', async (ctx) => {
    ctx.data.headers = await ctx.page.locator('table#data thead th').allTextContents();
    ctx.log('Headers:', ctx.data.headers.join(', '));
  })
  .step('Extract table rows', async (ctx) => {
    const bodyRows = ctx.page.locator('table#data tbody tr');
    const total = await bodyRows.count();

    for (let rowIndex = 0; rowIndex < total; rowIndex++) {
      const cellTexts: string[] = await bodyRows.nth(rowIndex).locator('td').allTextContents();

      // Pair each header with the cell in the same column; a missing cell becomes ''.
      const rowData: Record<string, string> = Object.fromEntries(
        ctx.data.headers.map((header: string, column: number) => [header, cellTexts[column] || ''])
      );
      ctx.data.rows.push(rowData);
    }

    ctx.log('Extracted rows:', ctx.data.rows.length);
  })
  .step('Log extracted data', async (ctx) => {
    // Show at most the first five rows, then summarise the remainder.
    const preview = ctx.data.rows.slice(0, 5);
    preview.forEach((row: Record<string, string>) => {
      ctx.log('Row:', JSON.stringify(row));
    });

    const remaining = ctx.data.rows.length - preview.length;
    if (remaining > 0) {
      ctx.log(`... and ${remaining} more rows`);
    }
  });

const result = await script.run();
if (result.success) {
  // Export to JSON
  console.log(JSON.stringify(result.data.rows, null, 2));
}

Product List Extraction

Extract e-commerce product data:

scripts/product-extraction.ts
import { Stepwright } from '@korvol/stepwright';
/** One product card scraped from the category grid. */
interface Product {
  name: string;
  /** Current price with currency symbols stripped; 0 when unparsable. */
  price: number;
  /** Pre-sale price; only set when the card has an `.original-price` element. */
  originalPrice?: number;
  /** Value of the `data-rating` attribute; 0 when missing or unparsable. */
  rating: number;
  reviewCount: number;
  imageUrl: string;
  /** Product link; root-relative hrefs are prefixed with the site origin. */
  url: string;
  /** False when the stock badge contains the text "Out of Stock". */
  inStock: boolean;
}

/** Script state: the category being scraped and everything extracted from it. */
interface ProductData {
  category: string;
  products: Product[];
  /** Count parsed from the `.results-count` text (0 when absent). */
  totalProducts: number;
}

/**
 * Parses a number out of display text such as "$1,299.99".
 * Every character except digits and dots is removed before parsing, and text
 * without a parsable number yields 0 instead of NaN.
 */
function parseAmount(text: string): number {
  const value = parseFloat(text.replace(/[^0-9.]/g, ''));
  return Number.isNaN(value) ? 0 : value;
}

/** Percentage saved when paying `price` instead of `originalPrice`. */
function discountPercent(price: number, originalPrice: number): number {
  return ((originalPrice - price) / originalPrice) * 100;
}

const script = Stepwright.create<ProductData>('Product Extraction')
  .config({
    headless: true,
    defaultTimeout: 30000,
  })
  .data({
    category: 'Electronics',
    products: [],
    totalProducts: 0,
  })
  .step('Navigate to category', async (ctx) => {
    await ctx.page.goto(`https://example.com/category/${ctx.data.category.toLowerCase()}`);
    await ctx.page.waitForSelector('.product-grid');
  })
  .step('Get total count', async (ctx) => {
    const countText = await ctx.page.locator('.results-count').textContent();
    // Accept thousands separators ("1,234 results") so the count is not
    // truncated at the first comma.
    const match = countText?.match(/\d[\d,]*/);
    ctx.data.totalProducts = match ? parseInt(match[0].replace(/,/g, ''), 10) : 0;
    ctx.log('Total products in category:', ctx.data.totalProducts);
  })
  .step('Extract products from current page', async (ctx) => {
    const productCards = ctx.page.locator('.product-card');
    const count = await productCards.count();

    for (let i = 0; i < count; i++) {
      const card = productCards.nth(i);

      const name = (await card.locator('.product-name').textContent()) ?? '';

      // Parse price (remove currency symbol)
      const price = parseAmount((await card.locator('.current-price').textContent()) ?? '0');

      // Check for sale price: the element only exists on discounted cards.
      let originalPrice: number | undefined;
      const originalPriceEl = card.locator('.original-price');
      if ((await originalPriceEl.count()) > 0) {
        originalPrice = parseAmount((await originalPriceEl.textContent()) ?? '0');
      }

      // Extract rating; fall back to 0 like the other numeric fields.
      const ratingText = (await card.locator('.rating').getAttribute('data-rating')) ?? '0';
      const rating = parseFloat(ratingText) || 0;

      // Extract review count
      const reviewText = (await card.locator('.review-count').textContent()) ?? '0';
      const reviewCount = parseInt(reviewText.replace(/[^0-9]/g, ''), 10) || 0;

      // Get image URL
      const imageUrl = (await card.locator('img.product-image').getAttribute('src')) ?? '';

      // Get product URL
      const url = (await card.locator('a.product-link').getAttribute('href')) ?? '';

      // Check stock status: in stock unless the badge says otherwise.
      const outOfStockCount = await card
        .locator('.stock-badge')
        .locator('text=Out of Stock')
        .count();
      const inStock = outOfStockCount === 0;

      ctx.data.products.push({
        name,
        price,
        originalPrice,
        rating,
        reviewCount,
        imageUrl,
        url: url.startsWith('/') ? `https://example.com${url}` : url,
        inStock,
      });
    }

    ctx.log('Extracted products from page:', count);
  })
  .step('Summary', async (ctx) => {
    ctx.log('Total extracted:', ctx.data.products.length);

    // Find best deal (biggest discount). A single pass avoids sorting, and the
    // strict comparison keeps the first product on ties.
    let bestDeal: Product | undefined;
    let bestDiscount = -Infinity;
    for (const product of ctx.data.products as Product[]) {
      if (!product.originalPrice) continue;
      const discount = discountPercent(product.price, product.originalPrice);
      if (discount > bestDiscount) {
        bestDiscount = discount;
        bestDeal = product;
      }
    }
    if (bestDeal) {
      ctx.log(`Best deal: ${bestDeal.name} - ${bestDiscount.toFixed(0)}% off`);
    }

    // Highest rated. Array.prototype.sort would reorder ctx.data.products in
    // place and corrupt the extraction order, so scan instead; also skip the
    // log entirely when nothing was extracted.
    const [first, ...rest] = ctx.data.products as Product[];
    if (first) {
      const highestRated = rest.reduce(
        (best: Product, candidate: Product) => (candidate.rating > best.rating ? candidate : best),
        first
      );
      ctx.log(`Highest rated: ${highestRated.name} - ${highestRated.rating} stars`);
    }
  });

Pagination Handling

Extract data across multiple pages:

scripts/paginated-extraction.ts
import { Stepwright } from '@korvol/stepwright';
/** A single listing entry. */
interface Item {
  /** Value of the listing element's `data-id` attribute. */
  id: string;
  /** Text of the nested `.item-title` element. */
  title: string;
}

/** Script state tracking progress through the paginated listing. */
interface PaginatedData {
  items: Item[];
  /** 1-based number of the page currently being processed. */
  currentPage: number;
  /** Parsed from the "Page X of Y" text; 1 when that text is not found. */
  totalPages: number;
}

// Walks the listing page by page, clicking "next" until the last page is reached.
const script = Stepwright.create<PaginatedData>('Paginated Extraction')
  .config({
    headless: true,
  })
  .data({
    items: [],
    currentPage: 1,
    totalPages: 0,
  })
  .step('Navigate to listing', async (ctx) => {
    await ctx.page.goto('https://example.com/listings');
    await ctx.page.waitForSelector('.listing-item');

    // Get total pages
    const paginationText = await ctx.page.locator('.pagination-info').textContent();
    const pageInfo = paginationText?.match(/Page \d+ of (\d+)/);
    if (pageInfo) {
      ctx.data.totalPages = parseInt(pageInfo[1]);
    } else {
      ctx.data.totalPages = 1;
    }
    ctx.log('Total pages:', ctx.data.totalPages);
  })
  .step('Extract all pages', async (ctx) => {
    while (ctx.data.currentPage <= ctx.data.totalPages) {
      ctx.log(`Processing page ${ctx.data.currentPage}/${ctx.data.totalPages}`);

      // Extract current page items
      const listing = ctx.page.locator('.listing-item');
      const found = await listing.count();
      for (let index = 0; index < found; index++) {
        const entry = listing.nth(index);
        const id = (await entry.getAttribute('data-id')) || '';
        const title = (await entry.locator('.item-title').textContent()) || '';
        ctx.data.items.push({ id, title });
      }
      ctx.log(`Extracted ${found} items from page ${ctx.data.currentPage}`);

      // Stop when there is no enabled "next" control or this was the last page.
      const nextButton = ctx.page.locator('.pagination-next:not([disabled])');
      const nextAvailable = (await nextButton.count()) > 0;
      if (!nextAvailable || ctx.data.currentPage >= ctx.data.totalPages) {
        break;
      }

      // Start listening before the click so the listing response cannot be missed.
      const listingResponse = ctx.page.waitForResponse(
        (res) => res.url().includes('/listings') && res.status() === 200
      );
      await nextButton.click();
      await listingResponse;
      ctx.data.currentPage++;

      // Wait for new content
      await ctx.page.waitForSelector('.listing-item');
    }

    ctx.log('Total items extracted:', ctx.data.items.length);
  });

Infinite Scroll Extraction

Handle infinite scroll pages:

scripts/infinite-scroll.ts
import { Stepwright } from '@korvol/stepwright';
/** Script state for harvesting an infinitely scrolling feed. */
interface ScrollData {
  /** Text of every distinct `.feed-item` seen so far. */
  items: string[];
  /** Number of completed scroll operations. */
  scrollCount: number;
  /** Upper bound on scroll operations. */
  maxScrolls: number;
}

const script = Stepwright.create<ScrollData>('Infinite Scroll Extraction')
  .config({
    headless: true,
  })
  .data({
    items: [],
    scrollCount: 0,
    maxScrolls: 10, // Limit scrolls
  })
  .step('Navigate to feed', async (ctx) => {
    await ctx.page.goto('https://example.com/feed');
    await ctx.page.waitForSelector('.feed-item');
  })
  .step('Scroll and extract', async (ctx) => {
    /**
     * Appends feed items whose text is not already in `ctx.data.items` and
     * returns how many were added. Membership is checked against a Set
     * snapshot so each pass is linear rather than quadratic.
     */
    const collectNewItems = async (): Promise<number> => {
      const allItems: string[] = await ctx.page.locator('.feed-item').allTextContents();
      const known = new Set<string>(ctx.data.items);
      const newItems = allItems.filter((item) => !known.has(item));
      if (newItems.length > 0) {
        ctx.data.items.push(...newItems);
        ctx.log(`Found ${newItems.length} new items`);
      }
      return newItems.length;
    };

    let previousHeight = 0;
    let noNewContent = 0;
    // True while a scroll has happened whose results have not been collected yet.
    let uncollectedScroll = false;

    while (ctx.data.scrollCount < ctx.data.maxScrolls) {
      // Get current height
      const currentHeight = await ctx.page.evaluate(() => document.body.scrollHeight);

      // Extract visible items (only new ones)
      const added = await collectNewItems();
      uncollectedScroll = false;
      if (added > 0) {
        noNewContent = 0;
      } else {
        noNewContent++;
        if (noNewContent >= 3) {
          ctx.log('No new content after 3 scrolls, stopping');
          break;
        }
      }

      // Scroll to bottom
      await ctx.page.evaluate(() => {
        window.scrollTo(0, document.body.scrollHeight);
      });
      uncollectedScroll = true;

      // Wait for potential new content
      await ctx.page.waitForTimeout(1000);

      // Check if page grew: stop once the height is unchanged both across this
      // scroll and relative to the start of the previous iteration.
      const newHeight = await ctx.page.evaluate(() => document.body.scrollHeight);
      if (newHeight === previousHeight && newHeight === currentHeight) {
        ctx.log('Reached end of content');
        break;
      }

      previousHeight = currentHeight;
      ctx.data.scrollCount++;
    }

    // The loop collects at the top of each iteration, so anything loaded by the
    // final scroll (e.g. when maxScrolls is hit) would otherwise be dropped.
    if (uncollectedScroll) {
      await collectNewItems();
    }

    ctx.log('Total items extracted:', ctx.data.items.length);
    ctx.log('Scroll operations:', ctx.data.scrollCount);
  });

Structured Data Extraction (JSON-LD)

Extract structured data (JSON-LD, Open Graph, and microdata) from a page:

scripts/structured-data.ts
import { Stepwright } from '@korvol/stepwright';
/** One `[itemscope]` element together with the `[itemprop]` descendants found inside it. */
interface MicrodataItem {
  /** Value of the `itemtype` attribute, or '' when absent. */
  type: string;
  /** Each property's `itemprop` name and its trimmed text (or `content` attribute). */
  props: Array<{ name: string | null; value: string | null }>;
}

/** Structured metadata collected from the page in three formats. */
interface StructuredData {
  /** Parsed objects from `<script type="application/ld+json">` tags. */
  jsonLd: Record<string, unknown>[];
  /** `og:*` meta properties mapped to their `content` values. */
  openGraph: Record<string, string>;
  /** Microdata scopes exactly as returned from the in-page extraction. */
  microdata: MicrodataItem[];
}

const script = Stepwright.create<StructuredData>('Structured Data Extraction')
  .config({
    headless: true,
  })
  .data({
    jsonLd: [],
    openGraph: {},
    microdata: [],
  })
  .step('Navigate to page', async (ctx) => {
    await ctx.page.goto('https://example.com/product/123');
  })
  .step('Extract JSON-LD', async (ctx) => {
    const jsonLdScripts = ctx.page.locator('script[type="application/ld+json"]');
    const count = await jsonLdScripts.count();

    for (let i = 0; i < count; i++) {
      const content = await jsonLdScripts.nth(i).textContent();
      if (!content) continue;

      try {
        const parsed: unknown = JSON.parse(content);
        // A JSON-LD script may hold a single object or a top-level array of
        // objects; flatten so every stored entry is one object.
        const entries: unknown[] = Array.isArray(parsed) ? parsed : [parsed];
        for (const entry of entries) {
          if (typeof entry === 'object' && entry !== null) {
            ctx.data.jsonLd.push(entry as Record<string, unknown>);
          }
        }
      } catch (e) {
        // Malformed JSON in one tag must not abort extraction of the others.
        ctx.log('Failed to parse JSON-LD:', e);
      }
    }

    ctx.log('JSON-LD objects found:', ctx.data.jsonLd.length);

    // Log product info if present. `@type` may be a string or an array of strings.
    const productData = ctx.data.jsonLd.find((d: Record<string, unknown>) => {
      const type = d['@type'];
      return Array.isArray(type) ? type.includes('Product') : type === 'Product';
    });
    if (productData) {
      ctx.log('Product name:', productData.name);

      // `offers` may be a single offer or a list; report the first one's price.
      const offers: unknown = productData.offers;
      const firstOffer: unknown = Array.isArray(offers) ? offers[0] : offers;
      const price =
        typeof firstOffer === 'object' && firstOffer !== null
          ? (firstOffer as Record<string, unknown>).price
          : undefined;
      ctx.log('Product price:', price);
    }
  })
  .step('Extract Open Graph meta tags', async (ctx) => {
    const ogTags = ctx.page.locator('meta[property^="og:"]');
    const count = await ogTags.count();

    for (let i = 0; i < count; i++) {
      const tag = ogTags.nth(i);
      const property = await tag.getAttribute('property');
      const content = await tag.getAttribute('content');
      // Skip tags missing either half of the pair.
      if (property && content) {
        ctx.data.openGraph[property] = content;
      }
    }

    ctx.log('Open Graph tags:', Object.keys(ctx.data.openGraph).length);
    ctx.log('OG Title:', ctx.data.openGraph['og:title']);
  })
  .step('Extract microdata', async (ctx) => {
    // Extract itemscope elements. This callback runs inside the page, so it
    // may only use DOM APIs and must return serialisable values.
    const itemScopes = await ctx.page.evaluate(() => {
      const elements = document.querySelectorAll('[itemscope]');
      return Array.from(elements).map((el) => ({
        type: el.getAttribute('itemtype') || '',
        props: Array.from(el.querySelectorAll('[itemprop]')).map((prop) => ({
          name: prop.getAttribute('itemprop'),
          // Prefer visible text; elements without text fall back to `content`.
          value: prop.textContent?.trim() || prop.getAttribute('content'),
        })),
      }));
    });

    // The result already has the MicrodataItem shape, so no assertion is needed.
    ctx.data.microdata = itemScopes;
    ctx.log('Microdata items:', ctx.data.microdata.length);
  });

API Response Extraction

Extract data from API calls made by the page:

scripts/api-extraction.ts
import { Stepwright } from '@korvol/stepwright';
/** Script state: every JSON API response observed while the page was in use. */
interface ApiData {
  apiResponses: Array<{
    /** Full URL of the captured response. */
    url: string;
    /** Parsed JSON body of the response. */
    data: unknown;
  }>;
}

// Registers a response listener first, then drives the page so its API traffic is recorded.
const script = Stepwright.create<ApiData>('API Extraction')
  .config({
    headless: true,
  })
  .data({
    apiResponses: [],
  })
  .step('Setup API interception', async (ctx) => {
    // Listen for API responses
    ctx.page.on('response', async (response) => {
      const url = response.url();

      // Only capture API calls
      if (!url.includes('/api/')) {
        return;
      }

      try {
        const data = await response.json();
        ctx.data.apiResponses.push({ url, data });
        ctx.log('Captured API:', url);
      } catch {
        // Not JSON response, skip
      }
    });

    ctx.log('API interception setup');
  })
  .step('Navigate and trigger API calls', async (ctx) => {
    await ctx.page.goto('https://example.com/dashboard');
    await ctx.page.waitForLoadState('networkidle');

    ctx.log('Page loaded, API calls captured:', ctx.data.apiResponses.length);
  })
  .step('Trigger more API calls', async (ctx) => {
    // Click to load more data
    await ctx.page.click('#load-more');
    await ctx.page.waitForLoadState('networkidle');

    ctx.log('Total API calls captured:', ctx.data.apiResponses.length);

    // Log captured data: the URL plus the first 100 characters of each payload.
    for (const captured of ctx.data.apiResponses) {
      const preview = JSON.stringify(captured.data).slice(0, 100);
      ctx.log('API:', captured.url);
      ctx.log('Data:', `${preview}...`);
    }
  });

Running the Examples

Terminal window
# Basic extraction: run the text-extraction script directly with tsx
npx tsx scripts/text-extraction.ts
# Save extracted data to a file by redirecting the script's stdout to products.json
npx tsx scripts/product-extraction.ts > products.json
# Run through the stepwright CLI with the --verbose flag for logging
npx stepwright run scripts/table-extraction.ts --verbose

Key Takeaways

  1. Use specific selectors to target data elements
  2. Handle missing data gracefully with defaults
  3. Parse numeric values carefully (remove currency symbols, etc.)
  4. Intercept API calls when possible for cleaner data
  5. Implement pagination for complete data extraction

Next Steps