Install the requests library for making HTTP requests.
pip install requests
Setup
import requests
API_KEY = 'ck_your_api_key'
BASE_URL = 'https://api.crawlkit.com/api/v1'
def crawlkit(endpoint, body):
    """Helper function for all CrawlKit requests."""
    response = requests.post(
        f'{BASE_URL}{endpoint}',
        headers={
            'Authorization': f'ApiKey {API_KEY}',
            'Content-Type': 'application/json',
        },
        json=body
    )
    return response.json()
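The helper above waits as long as the server does. If you want requests to give up after a fixed time, you can pass requests' timeout argument; below is a sketch of that variant (the 60-second default is an assumption, not a CrawlKit requirement).
def crawlkit(endpoint, body, timeout=60):
    """Helper for CrawlKit requests with a client-side timeout (sketch)."""
    response = requests.post(
        f'{BASE_URL}{endpoint}',
        headers={
            'Authorization': f'ApiKey {API_KEY}',
            'Content-Type': 'application/json',
        },
        json=body,
        timeout=timeout,  # seconds before requests aborts the connect/read
    )
    return response.json()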
Raw Crawl
Fetch a webpage and get its HTML content.
Basic Request
result = crawlkit('/crawl/raw', {
    'url': 'https://example.com'
})

print(result['data']['body'])        # HTML content
print(result['data']['statusCode'])  # 200
With Options
result = crawlkit('/crawl/raw', {
    'url': 'https://example.com',
    'options': {
        'timeout': 30000,
        'followRedirects': True,
        'maxRedirects': 5,
        'headers': {
            'User-Agent': 'MyApp/1.0'
        }
    }
})
Complete Example
def fetch_page(url, **options):
    """Fetch a webpage and return its HTML content."""
    result = crawlkit('/crawl/raw', {
        'url': url,
        'options': options
    })

    if not result['success']:
        raise Exception(result['error']['message'])

    data = result['data']
    print(f"Fetched {data['finalUrl']}")
    print(f"Status: {data['statusCode']}")
    print(f"Size: {data['contentLength']} bytes")
    print(f"Time: {data['timing']['total']}ms")
    print(f"Credits remaining: {data['creditsRemaining']}")

    return data['body']

# Usage
html = fetch_page('https://example.com')
Web Search
Search the web using DuckDuckGo.
Basic Search
result = crawlkit('/crawl/search', {
    'query': 'web scraping python'
})

for item in result['data']['results']:
    print(f"{item['position']}. {item['title']}")
    print(f" {item['url']}")
    print(f" {item['snippet']}\n")
With Filters
result = crawlkit('/crawl/search', {
    'query': 'web scraping python',
    'options': {
        'language': 'en-US',
        'region': 'us-en',
        'timeRange': 'm',  # Last month
        'maxResults': 20
    }
})
Complete Example
def search(query, **options):
    """Search the web and return results."""
    result = crawlkit('/crawl/search', {
        'query': query,
        'options': options
    })

    if not result['success']:
        raise Exception(result['error']['message'])

    data = result['data']
    print(f"Found {data['totalResults']} results for '{query}'")

    return [
        {
            'title': r['title'],
            'url': r['url'],
            'snippet': r['snippet']
        }
        for r in data['results']
    ]

# Usage
results = search('python tutorial', maxResults=10)
for r in results:
    print(r['title'])
Screenshot
Take a full-page screenshot of any website.
Basic Screenshot
result = crawlkit('/crawl/screenshot', {
    'url': 'https://example.com'
})

print('Screenshot URL:', result['data']['url'])
With Options
result = crawlkit('/crawl/screenshot', {
    'url': 'https://example.com',
    'options': {
        'width': 1920,
        'height': 1080,
        'timeout': 30000,
        'waitForSelector': '.content-loaded'
    }
})
Download Screenshot
def take_screenshot(url, filename, **options):
    """Take a screenshot and save it to a file."""
    result = crawlkit('/crawl/screenshot', {
        'url': url,
        'options': options
    })

    if not result['success']:
        raise Exception(result['error']['message'])

    # Download the image
    image_url = result['data']['url']
    response = requests.get(image_url)
    with open(filename, 'wb') as f:
        f.write(response.content)

    print(f'Saved to {filename}')
    return filename

# Usage
take_screenshot('https://example.com', 'screenshot.png')
Error Handling
def safe_crawl(url):
    """Crawl a URL with proper error handling."""
    result = crawlkit('/crawl/raw', {'url': url})

    if not result['success']:
        error = result['error']
        code = error['code']
        message = error['message']

        if code == 'INSUFFICIENT_CREDITS':
            print('Out of credits! Please purchase more.')
        elif code == 'INVALID_URL':
            print('Invalid URL provided.')
        elif code == 'TIMEOUT':
            print('Request timed out. Try increasing the timeout.')
        else:
            print(f'Error: {message}')
        return None

    return result['data']
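A usage sketch, reusing the statusCode and body fields returned by the raw crawl endpoint above:
# Usage
data = safe_crawl('https://example.com')
if data:
    print('Status:', data['statusCode'])
    print('Size:', len(data['body']), 'bytes')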
Using Environment Variables
import os

API_KEY = os.environ.get('CRAWLKIT_API_KEY')
if not API_KEY:
    raise Exception('CRAWLKIT_API_KEY environment variable not set')
export CRAWLKIT_API_KEY="ck_your_api_key"
python your_script.py
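If you would rather keep the key in a local file than export it in every shell, one common option is a .env file; the sketch below assumes the third-party python-dotenv package, which is not part of this quickstart.
# .env file in the project root:
#   CRAWLKIT_API_KEY=ck_your_api_key
import os
from dotenv import load_dotenv  # pip install python-dotenv (assumed dependency)

load_dotenv()  # reads .env and populates os.environ
API_KEY = os.environ.get('CRAWLKIT_API_KEY')
if not API_KEY:
    raise Exception('CRAWLKIT_API_KEY environment variable not set')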
Batch Processing
from concurrent.futures import ThreadPoolExecutor, as_completed

def crawl_multiple(urls, max_workers=5):
    """Crawl multiple URLs in parallel."""
    results = {}

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {
            executor.submit(fetch_page, url): url
            for url in urls
        }

        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                results[url] = future.result()
            except Exception as e:
                results[url] = f'Error: {e}'

    return results

# Usage
urls = [
    'https://example.com',
    'https://httpbin.org/html',
    'https://jsonplaceholder.typicode.com'
]

results = crawl_multiple(urls)
for url, html in results.items():
    print(f'{url}: {len(html)} bytes')
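Note that crawl_multiple() stores either the HTML body or an 'Error: ...' string for each URL, so the loop above also counts error messages as bytes. A small follow-up sketch, using only the function defined above, splits successes from failures:
results = crawl_multiple(urls)

# Heuristic: failures were stored as strings starting with 'Error: '
failed = {u: v for u, v in results.items() if v.startswith('Error: ')}
ok = {u: v for u, v in results.items() if u not in failed}

print(f'{len(ok)} succeeded, {len(failed)} failed')
for url, err in failed.items():
    print(f'  {url}: {err}')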