From 6d27ad5744a0accee4273ea43e998a23d56ad009 Mon Sep 17 00:00:00 2001 From: "user.mail" Date: Mon, 9 Feb 2026 16:53:34 +0300 Subject: [PATCH 1/5] Add Datasets API with 8 datasets and demo notebooks - Datasets: LinkedIn (profiles, companies), Amazon, Crunchbase, IMDB, NBA, Goodreads, World Population - Export utilities: export_json, export_csv, export_jsonl - Notebooks: linkedin, amazon, crunchbase demos --- CHANGELOG.md | 34 + LICENSE | 1 - notebooks/datasets/amazon/amazon.ipynb | 490 ++++++++++++ .../datasets/crunchbase/crunchbase.ipynb | 588 ++++++++++++++ notebooks/datasets/linkedin/linkedin.ipynb | 754 ++++++++++++++++++ pyproject.toml | 2 +- requirements.txt | 1 - src/brightdata/cli/README.md | 1 - src/brightdata/cli/banner.py | 12 +- src/brightdata/client.py | 31 + src/brightdata/datasets/__init__.py | 52 ++ src/brightdata/datasets/amazon/__init__.py | 5 + src/brightdata/datasets/amazon/products.py | 412 ++++++++++ src/brightdata/datasets/base.py | 221 +++++ src/brightdata/datasets/client.py | 136 ++++ .../datasets/crunchbase/__init__.py | 5 + .../datasets/crunchbase/companies.py | 602 ++++++++++++++ src/brightdata/datasets/goodreads/__init__.py | 5 + src/brightdata/datasets/goodreads/books.py | 121 +++ src/brightdata/datasets/imdb/__init__.py | 5 + src/brightdata/datasets/imdb/movies.py | 195 +++++ src/brightdata/datasets/linkedin/__init__.py | 6 + .../datasets/linkedin/company_profiles.py | 197 +++++ .../datasets/linkedin/people_profiles.py | 285 +++++++ src/brightdata/datasets/models.py | 73 ++ src/brightdata/datasets/nba/__init__.py | 5 + src/brightdata/datasets/nba/players_stats.py | 136 ++++ src/brightdata/datasets/utils.py | 139 ++++ .../datasets/world_population/__init__.py | 5 + .../datasets/world_population/countries.py | 155 ++++ src/brightdata/utils/ssl_helpers.py | 4 +- 31 files changed, 4666 insertions(+), 12 deletions(-) create mode 100644 notebooks/datasets/amazon/amazon.ipynb create mode 100644 notebooks/datasets/crunchbase/crunchbase.ipynb create mode 100644 notebooks/datasets/linkedin/linkedin.ipynb create mode 100644 src/brightdata/datasets/__init__.py create mode 100644 src/brightdata/datasets/amazon/__init__.py create mode 100644 src/brightdata/datasets/amazon/products.py create mode 100644 src/brightdata/datasets/base.py create mode 100644 src/brightdata/datasets/client.py create mode 100644 src/brightdata/datasets/crunchbase/__init__.py create mode 100644 src/brightdata/datasets/crunchbase/companies.py create mode 100644 src/brightdata/datasets/goodreads/__init__.py create mode 100644 src/brightdata/datasets/goodreads/books.py create mode 100644 src/brightdata/datasets/imdb/__init__.py create mode 100644 src/brightdata/datasets/imdb/movies.py create mode 100644 src/brightdata/datasets/linkedin/__init__.py create mode 100644 src/brightdata/datasets/linkedin/company_profiles.py create mode 100644 src/brightdata/datasets/linkedin/people_profiles.py create mode 100644 src/brightdata/datasets/models.py create mode 100644 src/brightdata/datasets/nba/__init__.py create mode 100644 src/brightdata/datasets/nba/players_stats.py create mode 100644 src/brightdata/datasets/utils.py create mode 100644 src/brightdata/datasets/world_population/__init__.py create mode 100644 src/brightdata/datasets/world_population/countries.py diff --git a/CHANGELOG.md b/CHANGELOG.md index fc1b06f..ee5c5dd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,39 @@ # Bright Data Python SDK Changelog +## Version 2.2.0 - Datasets API + +### ✨ New Features + +#### Datasets API +Access Bright 
Data's pre-collected datasets with filtering and export capabilities. + +```python +async with BrightDataClient() as client: + # Filter dataset records + snapshot_id = await client.datasets.amazon_products.filter( + filter={"name": "rating", "operator": ">=", "value": 4.5}, + records_limit=100 + ) + # Download results + data = await client.datasets.amazon_products.download(snapshot_id) +``` + +**8 Datasets:** LinkedIn Profiles, LinkedIn Companies, Amazon Products, Crunchbase Companies, IMDB Movies, NBA Players Stats, Goodreads Books, World Population + +**Export Utilities:** +```python +from brightdata.datasets import export_json, export_csv +export_json(data, "results.json") +export_csv(data, "results.csv") +``` + +### 📓 Notebooks +- `notebooks/datasets/linkedin/linkedin.ipynb` - LinkedIn datasets (profiles & companies) +- `notebooks/datasets/amazon/amazon.ipynb` - Amazon products dataset +- `notebooks/datasets/crunchbase/crunchbase.ipynb` - Crunchbase companies dataset + +--- + ## Version 2.1.2 - Web Scrapers & Notebooks ### 🐛 Bug Fixes diff --git a/LICENSE b/LICENSE index 3743c5b..f67927a 100644 --- a/LICENSE +++ b/LICENSE @@ -19,4 +19,3 @@ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. - diff --git a/notebooks/datasets/amazon/amazon.ipynb b/notebooks/datasets/amazon/amazon.ipynb new file mode 100644 index 0000000..9ca9e47 --- /dev/null +++ b/notebooks/datasets/amazon/amazon.ipynb @@ -0,0 +1,490 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 🛒 Amazon Products Dataset API\n", + "\n", + "Access Bright Data's pre-collected Amazon Products dataset:\n", + "- **85 fields** including pricing, ratings, reviews, categories, and more\n", + "- Filter by price, rating, brand, category, availability, and other criteria\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "API Token: 7011787d-2...3336\n", + "Setup complete!\n" + ] + } + ], + "source": [ + "import os\n", + "from dotenv import load_dotenv\n", + "load_dotenv()\n", + "\n", + "API_TOKEN = os.getenv(\"BRIGHTDATA_API_TOKEN\")\n", + "if not API_TOKEN:\n", + " raise ValueError(\"Set BRIGHTDATA_API_TOKEN in .env file\")\n", + "\n", + "print(f\"API Token: {API_TOKEN[:10]}...{API_TOKEN[-4:]}\")\n", + "print(\"Setup complete!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize Client" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Client initialized\n" + ] + } + ], + "source": [ + "from brightdata import BrightDataClient\n", + "\n", + "client = BrightDataClient(token=API_TOKEN)\n", + "\n", + "print(\"Client initialized\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 1: Explore Amazon Products Fields\n", + "\n", + "Before filtering, explore available fields using the class metadata." 
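+    ,
+    "\n",
+    "These are class-level tables bundled with the SDK, so no API call is needed here. For example, to see which numeric fields accept range filters (a minimal sketch using the same helper as the cell below):\n",
+    "\n",
+    "```python\n",
+    "from brightdata.datasets import AmazonProducts\n",
+    "\n",
+    "# Numeric fields support >, <, >=, <= (e.g. rating, reviews_count)\n",
+    "print(AmazonProducts.get_fields_by_type(\"number\"))\n",
+    "```"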
+ ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== Amazon Products Dataset ===\n", + "Dataset ID: gd_l7q7dkf244hwjntr0\n", + "Total fields: 85\n", + "\n", + "Field types:\n", + " Text fields: 38\n", + " Number fields: 14\n", + " Array fields: 15\n", + " Boolean fields: 7\n", + " URL fields: 7\n" + ] + } + ], + "source": [ + "from brightdata.datasets import AmazonProducts\n", + "\n", + "print(\"=== Amazon Products Dataset ===\")\n", + "print(f\"Dataset ID: {AmazonProducts.DATASET_ID}\")\n", + "print(f\"Total fields: {len(AmazonProducts.FIELDS)}\")\n", + "\n", + "# Show field types breakdown\n", + "print(f\"\\nField types:\")\n", + "print(f\" Text fields: {len(AmazonProducts.get_fields_by_type('text'))}\")\n", + "print(f\" Number fields: {len(AmazonProducts.get_fields_by_type('number'))}\")\n", + "print(f\" Array fields: {len(AmazonProducts.get_fields_by_type('array'))}\")\n", + "print(f\" Boolean fields: {len(AmazonProducts.get_fields_by_type('boolean'))}\")\n", + "print(f\" URL fields: {len(AmazonProducts.get_fields_by_type('url'))}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 2: Get Dataset Metadata from API" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fetching Amazon Products metadata from API...\n", + "\n", + "Dataset ID: gd_l7q7dkf244hwjntr0\n", + "Total fields from API: 86\n", + "\n", + "=== Sample Fields ===\n", + " title: text - Product title\n", + " seller_name: text - Seller name\n", + " brand: text - Product brand\n", + " description: text - A brief description of the product\n", + " initial_price: price - Initial price\n", + " currency: text - Currency of the product\n", + " availability: text - Product availability\n", + " reviews_count: number - Number of reviews\n", + " categories: array - Product categories\n", + " parent_asin: text - Parent ASIN of the product\n" + ] + } + ], + "source": [ + "print(\"Fetching Amazon Products metadata from API...\\n\")\n", + "\n", + "async with client:\n", + " metadata = await client.datasets.amazon_products.get_metadata()\n", + "\n", + "print(f\"Dataset ID: {metadata.id}\")\n", + "print(f\"Total fields from API: {len(metadata.fields)}\")\n", + "\n", + "print(\"\\n=== Sample Fields ===\")\n", + "for i, (name, field) in enumerate(list(metadata.fields.items())[:10]):\n", + " print(f\" {name}: {field.type} - {field.description or 'N/A'}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 3: Keyword Search with Rating Filter\n", + "\n", + "Search for products by keyword and filter by rating." 
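+    ,
+    "\n",
+    "`filter()` only creates the snapshot; `download()` polls until it is ready. If the snapshot is still `scheduled` when the timeout expires (as in the run below), the snapshot is not lost; re-run the download with the same `snapshot_id` and a larger `timeout`. A minimal recovery sketch, using the same parameters as below:\n",
+    "\n",
+    "```python\n",
+    "async with client:\n",
+    "    try:\n",
+    "        data = await client.datasets.amazon_products.download(snapshot_id, timeout=300)\n",
+    "    except TimeoutError:\n",
+    "        # Snapshot still building; poll again with a longer budget\n",
+    "        data = await client.datasets.amazon_products.download(snapshot_id, timeout=900)\n",
+    "```"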
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Filter: Keyboards with rating >= 4.5\n", + "Records limit: 2\n", + "\n", + "Snapshot created: snap_mley0j875vz72i0rb\n", + "\n", + "Run the next cell to download the data...\n" + ] + } + ], + "source": [ + "# Step 1: Create filter and get snapshot_id\n", + "# Search for keyboards with 4.5+ star rating\n", + "FILTER = {\n", + " \"operator\": \"and\",\n", + " \"filters\": [\n", + " {\"name\": \"title\", \"operator\": \"includes\", \"value\": \"keyboard\"},\n", + " {\"name\": \"rating\", \"operator\": \">=\", \"value\": 4.5}\n", + " ]\n", + "}\n", + "LIMIT = 2\n", + "\n", + "print(\"Filter: Keyboards with rating >= 4.5\")\n", + "print(f\"Records limit: {LIMIT}\\n\")\n", + "\n", + "async with client:\n", + " snapshot_id = await client.datasets.amazon_products.filter(\n", + " filter=FILTER,\n", + " records_limit=LIMIT\n", + " )\n", + "\n", + " \n", + "\n", + "print(f\"Snapshot created: {snapshot_id}\")\n", + "print(\"\\nRun the next cell to download the data...\")" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading snapshot: snap_mley0j875vz72i0rb\n", + "(This will poll until ready...)\n", + "\n" + ] + }, + { + "ename": "TimeoutError", + "evalue": "Snapshot snap_mley0j875vz72i0rb not ready after 300s (status: scheduled)", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mTimeoutError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[9]\u001b[39m\u001b[32m, line 6\u001b[39m\n\u001b[32m 3\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33m\"\u001b[39m\u001b[33m(This will poll until ready...)\u001b[39m\u001b[38;5;130;01m\\n\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 5\u001b[39m \u001b[38;5;28;01masync\u001b[39;00m \u001b[38;5;28;01mwith\u001b[39;00m client:\n\u001b[32m----> \u001b[39m\u001b[32m6\u001b[39m data = \u001b[38;5;28;01mawait\u001b[39;00m client.datasets.amazon_products.download(\n\u001b[32m 7\u001b[39m snapshot_id,\n\u001b[32m 8\u001b[39m timeout=\u001b[32m300\u001b[39m,\n\u001b[32m 9\u001b[39m poll_interval=\u001b[32m5\u001b[39m\n\u001b[32m 10\u001b[39m )\n\u001b[32m 12\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mDownloaded \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(data)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m products:\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 13\u001b[39m \u001b[38;5;28;01mfor\u001b[39;00m product \u001b[38;5;129;01min\u001b[39;00m data[:\u001b[32m5\u001b[39m]:\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/projects/sdk-python/src/brightdata/datasets/base.py:156\u001b[39m, in \u001b[36mBaseDataset.download\u001b[39m\u001b[34m(self, snapshot_id, format, timeout, poll_interval)\u001b[39m\n\u001b[32m 154\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m DatasetError(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mSnapshot failed: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstatus.error\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 155\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m time.time() - start_time > timeout:\n\u001b[32m--> \u001b[39m\u001b[32m156\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m 
\u001b[38;5;167;01mTimeoutError\u001b[39;00m(\n\u001b[32m 157\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mSnapshot \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msnapshot_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m not ready after \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtimeout\u001b[38;5;132;01m}\u001b[39;00m\u001b[33ms \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 158\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m(status: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstatus.status\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m)\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 159\u001b[39m )\n\u001b[32m 161\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m asyncio.sleep(poll_interval)\n\u001b[32m 163\u001b[39m \u001b[38;5;66;03m# Download data\u001b[39;00m\n", + "\u001b[31mTimeoutError\u001b[39m: Snapshot snap_mley0j875vz72i0rb not ready after 300s (status: scheduled)" + ] + } + ], + "source": [ + "# Step 2: Download data (polls until ready)\n", + "print(f\"Downloading snapshot: {snapshot_id}\")\n", + "print(\"(This will poll until ready...)\\n\")\n", + "\n", + "async with client:\n", + " data = await client.datasets.amazon_products.download(\n", + " snapshot_id,\n", + " timeout=300,\n", + " poll_interval=5\n", + " )\n", + "\n", + "print(f\"Downloaded {len(data)} products:\")\n", + "for product in data[:5]:\n", + " print(f\"\\n Title: {product.get('title', 'N/A')[:60]}...\")\n", + " print(f\" Rating: {product.get('rating', 'N/A')} ({product.get('reviews_count', 0)} reviews)\")\n", + " print(f\" Price: {product.get('currency', '')} {product.get('final_price', 'N/A')}\")\n", + " print(f\" Brand: {product.get('brand', 'N/A')}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 4: Filter by Price Range\n", + "\n", + "Find products in a specific price range." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Step 1: Create filter\n", + "PRICE_FILTER = {\n", + " \"operator\": \"and\",\n", + " \"filters\": [\n", + " {\"name\": \"final_price\", \"operator\": \">=\", \"value\": 50},\n", + " {\"name\": \"final_price\", \"operator\": \"<=\", \"value\": 100}\n", + " ]\n", + "}\n", + "\n", + "print(\"Filter: Products priced $50-$100\")\n", + "print(f\"Records limit: 5\\n\")\n", + "\n", + "async with client:\n", + " snapshot_id = await client.datasets.amazon_products.filter(\n", + " filter=PRICE_FILTER,\n", + " records_limit=5\n", + " )\n", + "\n", + "print(f\"Snapshot created: {snapshot_id}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Step 2: Download\n", + "print(f\"Downloading snapshot: {snapshot_id}\\n\")\n", + "\n", + "async with client:\n", + " data = await client.datasets.amazon_products.download(snapshot_id)\n", + "\n", + "print(f\"Downloaded {len(data)} products:\")\n", + "for product in data:\n", + " print(f\" - {product.get('title', 'N/A')[:50]}... - ${product.get('final_price', 'N/A')}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 5: Filter by Availability and Prime\n", + "\n", + "Find available Prime-eligible products." 
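+    ,
+    "\n",
+    "Boolean fields take Python `True`/`False` with the `=` operator. Swapping the group operator to `\"or\"` would match products that satisfy either condition instead of both (a variant sketch, same shape as below):\n",
+    "\n",
+    "```python\n",
+    "EITHER_FILTER = {\n",
+    "    \"operator\": \"or\",\n",
+    "    \"filters\": [\n",
+    "        {\"name\": \"is_available\", \"operator\": \"=\", \"value\": True},\n",
+    "        {\"name\": \"amazon_prime\", \"operator\": \"=\", \"value\": True}\n",
+    "    ]\n",
+    "}\n",
+    "```"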
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 1: Create filter\n",
+    "PRIME_FILTER = {\n",
+    "    \"operator\": \"and\",\n",
+    "    \"filters\": [\n",
+    "        {\"name\": \"is_available\", \"operator\": \"=\", \"value\": True},\n",
+    "        {\"name\": \"amazon_prime\", \"operator\": \"=\", \"value\": True}\n",
+    "    ]\n",
+    "}\n",
+    "\n",
+    "print(\"Filter: Available + Prime eligible\")\n",
+    "print(f\"Records limit: 5\\n\")\n",
+    "\n",
+    "async with client:\n",
+    "    snapshot_id = await client.datasets.amazon_products.filter(\n",
+    "        filter=PRIME_FILTER,\n",
+    "        records_limit=5\n",
+    "    )\n",
+    "\n",
+    "print(f\"Snapshot created: {snapshot_id}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 2: Download\n",
+    "print(f\"Downloading snapshot: {snapshot_id}\\n\")\n",
+    "\n",
+    "async with client:\n",
+    "    data = await client.datasets.amazon_products.download(snapshot_id)\n",
+    "\n",
+    "print(f\"Downloaded {len(data)} products:\")\n",
+    "for product in data:\n",
+    "    print(f\"\\n  Title: {product.get('title', 'N/A')[:50]}...\")\n",
+    "    print(f\"  Available: {product.get('is_available', 'N/A')}\")\n",
+    "    print(f\"  Prime: {product.get('amazon_prime', 'N/A')}\")\n",
+    "    print(f\"  Price: ${product.get('final_price', 'N/A')}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Test 6: Filter by Brand"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 1: Create filter\n",
+    "BRAND = \"Apple\"\n",
+    "\n",
+    "BRAND_FILTER = {\n",
+    "    \"name\": \"brand\",\n",
+    "    \"operator\": \"=\",\n",
+    "    \"value\": BRAND\n",
+    "}\n",
+    "\n",
+    "print(f\"Filter: Brand = {BRAND}\")\n",
+    "print(f\"Records limit: 5\\n\")\n",
+    "\n",
+    "async with client:\n",
+    "    snapshot_id = await client.datasets.amazon_products.filter(\n",
+    "        filter=BRAND_FILTER,\n",
+    "        records_limit=5\n",
+    "    )\n",
+    "\n",
+    "print(f\"Snapshot created: {snapshot_id}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 2: Download\n",
+    "print(f\"Downloading snapshot: {snapshot_id}\\n\")\n",
+    "\n",
+    "async with client:\n",
+    "    data = await client.datasets.amazon_products.download(snapshot_id)\n",
+    "\n",
+    "print(f\"Downloaded {len(data)} products:\")\n",
+    "for product in data:\n",
+    "    print(f\"  - {product.get('title', 'N/A')[:60]}...\")\n",
+    "    print(f\"    Brand: {product.get('brand', 'N/A')}, Price: ${product.get('final_price', 'N/A')}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Test 7: Export Results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from pathlib import Path\n",
+    "\n",
+    "if data:\n",
+    "    output_file = Path.cwd() / \"amazon_dataset_results.json\"\n",
+    "    \n",
+    "    with open(output_file, \"w\") as f:\n",
+    "        json.dump(data, f, indent=2, default=str)\n",
+    "    \n",
+    "    print(f\"Exported to: {output_file}\")\n",
+    "    print(f\"Records: {len(data)}\")\n",
+    "else:\n",
+    "    print(\"No data to export\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": "from brightdata.datasets import export_json, export_csv, export\n\n# Export to JSON\njson_file = export_json(data, \"amazon_results.json\")\nprint(f\"Exported to: {json_file}\")\n\n# Export to CSV\ncsv_file = export_csv(data, 
\"amazon_results.csv\")\nprint(f\"Exported to: {csv_file}\")\n\n# Or use auto-detect based on extension\n# export(data, \"results.json\")\n# export(data, \"results.csv\")\n\nprint(f\"\\nRecords: {len(data)}\")" + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/datasets/crunchbase/crunchbase.ipynb b/notebooks/datasets/crunchbase/crunchbase.ipynb new file mode 100644 index 0000000..0babefd --- /dev/null +++ b/notebooks/datasets/crunchbase/crunchbase.ipynb @@ -0,0 +1,588 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 🏢 Crunchbase Companies Dataset API\n", + "\n", + "Access Bright Data's pre-collected Crunchbase Companies dataset:\n", + "- **2.3M+ companies** with **98 fields**\n", + "- Filter by funding, employees, industry, location, and more\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "API Token: 7011787d-2...3336\n", + "Setup complete!\n" + ] + } + ], + "source": [ + "import os\n", + "from dotenv import load_dotenv\n", + "load_dotenv()\n", + "\n", + "API_TOKEN = os.getenv(\"BRIGHTDATA_API_TOKEN\")\n", + "if not API_TOKEN:\n", + " raise ValueError(\"Set BRIGHTDATA_API_TOKEN in .env file\")\n", + "\n", + "print(f\"API Token: {API_TOKEN[:10]}...{API_TOKEN[-4:]}\")\n", + "print(\"Setup complete!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize Client" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Client initialized\n" + ] + } + ], + "source": [ + "from brightdata import BrightDataClient\n", + "\n", + "client = BrightDataClient(token=API_TOKEN)\n", + "\n", + "print(\"Client initialized\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 1: Explore Crunchbase Fields\n", + "\n", + "Explore the 98 available fields." 
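+    ,
+    "\n",
+    "Fill rate is the share of records where a field is populated, so filters on low-fill-rate fields (e.g. `num_investors` at 8.2%) match far fewer records. The helper used in the next cell surfaces the reliable ones:\n",
+    "\n",
+    "```python\n",
+    "from brightdata.datasets import CrunchbaseCompanies\n",
+    "\n",
+    "# Fields populated in at least 90% of records\n",
+    "reliable = CrunchbaseCompanies.get_high_fill_rate_fields(min_rate=90.0)\n",
+    "```"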
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from brightdata.datasets import CrunchbaseCompanies\n", + "\n", + "print(\"=== Crunchbase Companies Dataset ===\")\n", + "print(f\"Dataset ID: {CrunchbaseCompanies.DATASET_ID}\")\n", + "print(f\"Total fields: {len(CrunchbaseCompanies.FIELDS)}\")\n", + "\n", + "# High fill rate fields (most reliable for filtering)\n", + "high_fill = CrunchbaseCompanies.get_high_fill_rate_fields(min_rate=90.0)\n", + "print(f\"\\nHigh fill rate fields (>90%): {len(high_fill)}\")\n", + "for field in high_fill[:10]:\n", + " info = CrunchbaseCompanies.FIELDS[field]\n", + " print(f\" - {field}: {info['type']} ({info['fill_rate']}%)\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show fields by type\n", + "print(\"\\n=== Fields by Type ===\")\n", + "print(f\"Text fields: {len(CrunchbaseCompanies.get_fields_by_type('text'))}\")\n", + "print(f\"Number fields: {len(CrunchbaseCompanies.get_fields_by_type('number'))}\")\n", + "print(f\"Array fields: {len(CrunchbaseCompanies.get_fields_by_type('array'))}\")\n", + "print(f\"Object fields: {len(CrunchbaseCompanies.get_fields_by_type('object'))}\")\n", + "print(f\"URL fields: {len(CrunchbaseCompanies.get_fields_by_type('url'))}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Show number fields (useful for filtering)\n", + "print(\"\\n=== Number Fields (for numeric filtering) ===\")\n", + "for field in CrunchbaseCompanies.get_fields_by_type('number')[:15]:\n", + " info = CrunchbaseCompanies.FIELDS[field]\n", + " print(f\" - {field}: {info['description'][:50]}... ({info['fill_rate']}%)\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 2: Get Dataset Metadata from API" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(\"Fetching Crunchbase metadata from API...\\n\")\n", + "\n", + "async with client:\n", + " metadata = await client.datasets.crunchbase_companies.get_metadata()\n", + "\n", + "print(f\"Dataset ID: {metadata.id}\")\n", + "print(f\"Total fields from API: {len(metadata.fields)}\")\n", + "\n", + "print(\"\\n=== Sample Fields ===\")\n", + "for i, (name, field) in enumerate(list(metadata.fields.items())[:10]):\n", + " print(f\" {name}: {field.type} - {field.description or 'N/A'}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 3: Filter by Operating Status\n", + "\n", + "Find active companies." 
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 1: Create filter and get snapshot_id\n",
+    "FILTER = {\n",
+    "    \"name\": \"operating_status\",\n",
+    "    \"operator\": \"=\",\n",
+    "    \"value\": \"active\"\n",
+    "}\n",
+    "LIMIT = 10\n",
+    "\n",
+    "print(f\"Filter: {FILTER}\")\n",
+    "print(f\"Records limit: {LIMIT}\\n\")\n",
+    "\n",
+    "async with client:\n",
+    "    snapshot_id = await client.datasets.crunchbase_companies.filter(\n",
+    "        filter=FILTER,\n",
+    "        records_limit=LIMIT\n",
+    "    )\n",
+    "\n",
+    "print(f\"Snapshot created: {snapshot_id}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 2: Download data (polls until ready)\n",
+    "print(f\"Downloading snapshot: {snapshot_id}\\n\")\n",
+    "\n",
+    "async with client:\n",
+    "    data = await client.datasets.crunchbase_companies.download(snapshot_id)\n",
+    "\n",
+    "print(f\"Downloaded {len(data)} companies:\")\n",
+    "for company in data[:5]:\n",
+    "    print(f\"\\n  Name: {company.get('name', 'N/A')}\")\n",
+    "    print(f\"  Status: {company.get('operating_status', 'N/A')}\")\n",
+    "    print(f\"  Industries: {company.get('industries', 'N/A')}\")\n",
+    "    print(f\"  Employees: {company.get('num_employees', 'N/A')}\")\n",
+    "    print(f\"  Website: {company.get('website', 'N/A')}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Test 4: Filter by Employee Count\n",
+    "\n",
+    "Find companies with more than 100 employee profiles."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 1: Create filter\n",
+    "EMPLOYEE_FILTER = {\n",
+    "    \"name\": \"num_employee_profiles\",\n",
+    "    \"operator\": \">\",\n",
+    "    \"value\": 100\n",
+    "}\n",
+    "\n",
+    "print(\"Filter: Companies with >100 employee profiles\")\n",
+    "print(f\"Records limit: 5\\n\")\n",
+    "\n",
+    "async with client:\n",
+    "    snapshot_id = await client.datasets.crunchbase_companies.filter(\n",
+    "        filter=EMPLOYEE_FILTER,\n",
+    "        records_limit=5\n",
+    "    )\n",
+    "\n",
+    "print(f\"Snapshot created: {snapshot_id}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 2: Download data\n",
+    "print(f\"Downloading snapshot: {snapshot_id}\\n\")\n",
+    "\n",
+    "async with client:\n",
+    "    data = await client.datasets.crunchbase_companies.download(snapshot_id)\n",
+    "\n",
+    "print(f\"Downloaded {len(data)} companies:\")\n",
+    "for company in data:\n",
+    "    print(f\"  - {company.get('name', 'N/A')}\")\n",
+    "    print(f\"    Employee profiles: {company.get('num_employee_profiles', 'N/A')}\")\n",
+    "    print(f\"    Employees: {company.get('num_employees', 'N/A')}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Test 5: Filter by Country\n",
+    "\n",
+    "Find US-based companies."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 1: Create filter\n",
+    "COUNTRY_FILTER = {\n",
+    "    \"name\": \"country_code\",\n",
+    "    \"operator\": \"=\",\n",
+    "    \"value\": \"USA\"\n",
+    "}\n",
+    "\n",
+    "print(\"Filter: US-based companies\")\n",
+    "print(f\"Records limit: 5\\n\")\n",
+    "\n",
+    "async with client:\n",
+    "    snapshot_id = await client.datasets.crunchbase_companies.filter(\n",
+    "        filter=COUNTRY_FILTER,\n",
+    "        records_limit=5\n",
+    "    )\n",
+    "\n",
+    "print(f\"Snapshot created: {snapshot_id}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 2: Download data\n",
+    "print(f\"Downloading snapshot: {snapshot_id}\\n\")\n",
+    "\n",
+    "async with client:\n",
+    "    data = await client.datasets.crunchbase_companies.download(snapshot_id)\n",
+    "\n",
+    "print(f\"Downloaded {len(data)} companies:\")\n",
+    "for company in data:\n",
+    "    print(f\"  - {company.get('name', 'N/A')} ({company.get('country_code', 'N/A')})\")\n",
+    "    print(f\"    HQ: {company.get('address', 'N/A')[:50]}...\" if company.get('address') and len(company.get('address', '')) > 50 else f\"    HQ: {company.get('address', 'N/A')}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Test 6: Filter by Funding\n",
+    "\n",
+    "Find active companies with investors."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 1: Create filter\n",
+    "FUNDED_FILTER = {\n",
+    "    \"operator\": \"and\",\n",
+    "    \"filters\": [\n",
+    "        {\"name\": \"operating_status\", \"operator\": \"=\", \"value\": \"active\"},\n",
+    "        {\"name\": \"num_investors\", \"operator\": \">\", \"value\": 0}\n",
+    "    ]\n",
+    "}\n",
+    "\n",
+    "print(\"Filter: Active companies with investors\")\n",
+    "print(f\"Records limit: 5\\n\")\n",
+    "\n",
+    "async with client:\n",
+    "    snapshot_id = await client.datasets.crunchbase_companies.filter(\n",
+    "        filter=FUNDED_FILTER,\n",
+    "        records_limit=5\n",
+    "    )\n",
+    "\n",
+    "print(f\"Snapshot created: {snapshot_id}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Step 2: Download data\n",
+    "print(f\"Downloading snapshot: {snapshot_id}\\n\")\n",
+    "\n",
+    "async with client:\n",
+    "    data = await client.datasets.crunchbase_companies.download(snapshot_id)\n",
+    "\n",
+    "print(f\"Downloaded {len(data)} companies:\")\n",
+    "for company in data:\n",
+    "    print(f\"\\n  Name: {company.get('name', 'N/A')}\")\n",
+    "    print(f\"  Status: {company.get('operating_status', 'N/A')}\")\n",
+    "    print(f\"  Investors: {company.get('num_investors', 'N/A')}\")\n",
+    "    print(f\"  Industries: {company.get('industries', 'N/A')}\")\n",
+    "    print(f\"  CB Rank: {company.get('cb_rank', 'N/A')}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Test 7: Filter by IPO Status"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Filter: public companies\n",
+    "IPO_FILTER = {\n",
+    "    \"name\": \"ipo_status\",\n",
+    "    \"operator\": \"=\",\n",
+    "    \"value\": \"public\"\n",
+    "}\n",
+    "\n",
+    "print(\"Filter: Public companies\")\n",
+    "print(f\"Records limit: 5\\n\")\n",
+    "\n",
+    "async with client:\n",
+    "    snapshot_id = await client.datasets.crunchbase_companies.filter(\n",
+    "        filter=IPO_FILTER,\n",
+    "        records_limit=5\n",
+    "    )\n",
+    "    print(f\"Snapshot: {snapshot_id}\")\n",
+    "    \n",
+    "    data = await client.datasets.crunchbase_companies.download(snapshot_id)\n",
+    "\n",
+    "print(f\"\\nDownloaded {len(data)} companies:\")\n",
+    "for company in data:\n",
+    "    print(f\"  - {company.get('name', 'N/A')}\")\n",
+    "    print(f\"    IPO Status: {company.get('ipo_status', 'N/A')}\")\n",
+    "    print(f\"    Stock Symbol: {company.get('stock_symbol', 'N/A')}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Test 8: Export Results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from pathlib import Path\n",
+    "\n",
+    "if data:\n",
+    "    output_file = Path.cwd() / \"crunchbase_dataset_results.json\"\n",
+    "    \n",
+    "    with open(output_file, \"w\") as f:\n",
+    "        json.dump(data, f, indent=2, default=str)\n",
+    "    \n",
+    "    print(f\"Exported to: {output_file}\")\n",
+    "    print(f\"Records: {len(data)}\")\n",
+    "else:\n",
+    "    print(\"No data to export\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": "from brightdata.datasets import export_json, export_csv, export\n\n# Export to JSON\njson_file = export_json(data, \"crunchbase_results.json\")\nprint(f\"Exported to: {json_file}\")\n\n# Export to CSV\ncsv_file = export_csv(data, \"crunchbase_results.csv\")\nprint(f\"Exported to: {csv_file}\")\n\n# Or use auto-detect based on extension\n# export(data, \"results.json\")\n# export(data, \"results.csv\")\n\nprint(f\"\\nRecords: {len(data)}\")"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Summary\n",
+    "\n",
+    "### Crunchbase Companies Dataset\n",
+    "\n",
+    "| Info | Value |\n",
+    "|------|-------|\n",
+    "| Dataset ID | `gd_l1vijqt9jfj7olije` |\n",
+    "| Records | 2.3M+ companies |\n",
+    "| Total Fields | 98 |\n",
+    "| Access | `client.datasets.crunchbase_companies` |\n",
+    "\n",
+    "### Key Fields for Filtering\n",
+    "\n",
+    "| Field | Type | Fill Rate | Description |\n",
+    "|-------|------|-----------|-------------|\n",
+    "| `name` | text | 100% | Company name |\n",
+    "| `operating_status` | text | 100% | active, closed, etc. |\n",
+    "| `ipo_status` | text | 99.9% | public, private, etc. 
|\n", + "| `country_code` | text | 93.5% | Country code |\n", + "| `cb_rank` | number | 97% | Crunchbase rank |\n", + "| `num_employees` | text | 86.3% | Employee count range |\n", + "| `num_employee_profiles` | number | 99.9% | LinkedIn profiles |\n", + "| `num_investors` | number | 8.2% | Investor count |\n", + "| `industries` | array | 94.5% | Industry categories |\n", + "\n", + "### Example Filters\n", + "\n", + "```python\n", + "# Active companies\n", + "{\"name\": \"operating_status\", \"operator\": \"=\", \"value\": \"active\"}\n", + "\n", + "# Public companies\n", + "{\"name\": \"ipo_status\", \"operator\": \"=\", \"value\": \"public\"}\n", + "\n", + "# US-based companies\n", + "{\"name\": \"country_code\", \"operator\": \"=\", \"value\": \"USA\"}\n", + "\n", + "# Companies with funding\n", + "{\n", + " \"operator\": \"and\",\n", + " \"filters\": [\n", + " {\"name\": \"operating_status\", \"operator\": \"=\", \"value\": \"active\"},\n", + " {\"name\": \"num_investors\", \"operator\": \">\", \"value\": 0}\n", + " ]\n", + "}\n", + "```\n", + "\n", + "### Helper Methods\n", + "\n", + "| Method | Description |\n", + "|--------|-------------|\n", + "| `get_field_names()` | List all 98 field names |\n", + "| `get_high_fill_rate_fields(min_rate)` | Fields above fill rate threshold |\n", + "| `get_fields_by_type(type)` | Fields of specific type |" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/notebooks/datasets/linkedin/linkedin.ipynb b/notebooks/datasets/linkedin/linkedin.ipynb new file mode 100644 index 0000000..d439317 --- /dev/null +++ b/notebooks/datasets/linkedin/linkedin.ipynb @@ -0,0 +1,754 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 📊 LinkedIn Datasets API\n", + "\n", + "Access Bright Data's pre-collected LinkedIn datasets:\n", + "- **LinkedIn People Profiles**: 620M+ profiles with 42 fields\n", + "- **LinkedIn Company Profiles**: 58.5M+ companies with 36 fields\n", + "\n", + "Unlike web scrapers that collect data on-demand, datasets provide instant access to pre-collected, structured data filtered by your criteria.\n", + "\n", + "---" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Setup" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "API Token: 7011787d-2...3336\n", + "Setup complete!\n" + ] + } + ], + "source": [ + "import os\n", + "from dotenv import load_dotenv\n", + "load_dotenv()\n", + "\n", + "API_TOKEN = os.getenv(\"BRIGHTDATA_API_TOKEN\")\n", + "if not API_TOKEN:\n", + " raise ValueError(\"Set BRIGHTDATA_API_TOKEN in .env file\")\n", + "\n", + "print(f\"API Token: {API_TOKEN[:10]}...{API_TOKEN[-4:]}\")\n", + "print(\"Setup complete!\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Initialize Client" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Client initialized\n", + "Available datasets: linkedin_profiles, linkedin_companies, amazon_products, 
crunchbase_companies\n" + ] + } + ], + "source": [ + "from brightdata import BrightDataClient\n", + "\n", + "client = BrightDataClient(token=API_TOKEN)\n", + "\n", + "print(\"Client initialized\")\n", + "print(f\"Available datasets: linkedin_profiles, linkedin_companies, amazon_products, crunchbase_companies\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 1: List Available Datasets\n", + "\n", + "List all datasets available in your account." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fetching available datasets...\n", + "\n", + "Found 174 datasets:\n", + "\n", + " - Crunchbase companies information\n", + " ID: gd_l1vijqt9jfj7olije\n", + " Size: 2.3M records\n", + "\n", + " - Instagram - Profiles\n", + " ID: gd_l1vikfch901nx3by4\n", + " Size: 620.0M records\n", + "\n", + " - Manta businesses \n", + " ID: gd_l1vil1d81g0u8763b2\n", + " Size: 5.6M records\n", + "\n", + " - US lawyers directory\n", + " ID: gd_l1vil5n11okchcbvax\n", + " Size: 1.4M records\n", + "\n", + " - LinkedIn company information\n", + " ID: gd_l1vikfnt1wgvvqz95w\n", + " Size: 55.0M records\n", + "\n", + " - LinkedIn people profiles\n", + " ID: gd_l1viktl72bvl7bjuj0\n", + " Size: 115.0M records\n", + "\n", + " - TikTok - Profiles\n", + " ID: gd_l1villgoiiidt09ci\n", + " Size: 152.0M records\n", + "\n", + " - Slintel 6sense company information\n", + " ID: gd_l1vilg5a1decoahvgq\n", + " Size: 10.9M records\n", + "\n", + " - Owler companies information\n", + " ID: gd_l1vilaxi10wutoage7\n", + " Size: 6.1M records\n", + "\n", + " - VentureRadar company information\n", + " ID: gd_l1vilsfd1xpsndbtpr\n", + " Size: 0.3M records\n", + "\n" + ] + } + ], + "source": [ + "print(\"Fetching available datasets...\\n\")\n", + "\n", + "async with client:\n", + " datasets = await client.datasets.list()\n", + "\n", + "print(f\"Found {len(datasets)} datasets:\\n\")\n", + "for ds in datasets[:10]: # Show first 10\n", + " size_m = ds.size / 1_000_000 if ds.size else 0\n", + " print(f\" - {ds.name}\")\n", + " print(f\" ID: {ds.id}\")\n", + " print(f\" Size: {size_m:.1f}M records\")\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 2: Explore LinkedIn Profiles Fields\n", + "\n", + "Before filtering, explore available fields using the class metadata." 
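+    ,
+    "\n",
+    "These class tables ship with the SDK and work offline; Test 3 below fetches the live schema from the API, which can differ slightly (42 bundled fields vs. 45 returned by the API in this run). A quick offline check:\n",
+    "\n",
+    "```python\n",
+    "from brightdata.datasets import LinkedInPeopleProfiles\n",
+    "\n",
+    "# Fields reliable enough to filter on (fill rate above 90%)\n",
+    "print(LinkedInPeopleProfiles.get_high_fill_rate_fields(min_rate=90.0))\n",
+    "```"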
+ ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "=== LinkedIn People Profiles Dataset ===\n", + "Dataset ID: gd_l1viktl72bvl7bjuj0\n", + "Total fields: 42\n", + "\n", + "High fill rate fields (>70%): 19\n", + " - id: text (100.0%)\n", + " A unique identifier for the person's LinkedIn profile\n", + " - name: text (97.54%)\n", + " Profile name\n", + " - first_name: text (95.1%)\n", + " First name of the user\n", + " - last_name: text (94.8%)\n", + " Last name of the user\n", + " - city: text (96.3%)\n", + " Geographical location of the user\n", + " - country_code: text (97.11%)\n", + " Geographical location of the user\n", + " - position: text (91.23%)\n", + " The current job title or position of the profile\n", + " - url: url (100.0%)\n", + " URL that links directly to the LinkedIn profile\n", + " - input_url: url (100.0%)\n", + " The URL that was entered when starting the scraping process\n", + " - linkedin_id: text (100.0%)\n", + " LinkedIn profile identifier\n", + " - linkedin_num_id: text (100.0%)\n", + " Numeric LinkedIn profile ID\n", + " - avatar: url (96.28%)\n", + " URL that links to the profile picture of the LinkedIn user\n", + " - banner_image: url (96.28%)\n", + " Banner image URL\n", + " - default_avatar: boolean (95.73%)\n", + " Is the avatar picture the default empty picture\n", + " - followers: number (71.39%)\n", + " How many users/companies following the profile\n", + " - connections: number (70.33%)\n", + " How many connections the profile has\n", + " - memorialized_account: boolean (99.44%)\n", + " Boolean indicating if the account is memorialized\n", + " - current_company: object (100.0%)\n", + " Current professional position info: company name, job title, company ID, industry\n", + " - experience: array (71.49%)\n", + " Professional history: job titles, dates, companies, locations\n" + ] + } + ], + "source": [ + "from brightdata.datasets import LinkedInPeopleProfiles\n", + "\n", + "print(\"=== LinkedIn People Profiles Dataset ===\")\n", + "print(f\"Dataset ID: {LinkedInPeopleProfiles.DATASET_ID}\")\n", + "print(f\"Total fields: {len(LinkedInPeopleProfiles.FIELDS)}\")\n", + "\n", + "# Get high fill rate fields (more reliable for filtering)\n", + "high_fill = LinkedInPeopleProfiles.get_high_fill_rate_fields(min_rate=70.0)\n", + "print(f\"\\nHigh fill rate fields (>70%): {len(high_fill)}\")\n", + "for field in high_fill:\n", + " info = LinkedInPeopleProfiles.FIELDS[field]\n", + " print(f\" - {field}: {info['type']} ({info['fill_rate']}%)\")\n", + " print(f\" {info['description']}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "=== All Available Fields ===\n", + " id: text - 100.0%\n", + " name: text - 97.54%\n", + " first_name: text - 95.1%\n", + " last_name: text - 94.8%\n", + " city: text - 96.3%\n", + " country_code: text - 97.11%\n", + " location: text - 61.93%\n", + " position: text - 91.23%\n", + " about: text - 18.9%\n", + " url: url - 100.0%\n", + " input_url: url - 100.0%\n", + " linkedin_id: text - 100.0%\n", + " linkedin_num_id: text - 100.0%\n", + " avatar: url - 96.28%\n", + " banner_image: url - 96.28%\n", + " default_avatar: boolean - 95.73%\n", + " followers: number - 71.39%\n", + " connections: number - 70.33%\n", + " recommendations_count: number - 3.65%\n", + " influencer: boolean - 46.06%\n", + " memorialized_account: 
boolean - 99.44%\n", + " current_company_name: text - 69.6%\n", + " current_company_company_id: text - 38.94%\n", + " current_company: object - 100.0%\n", + " experience: array - 71.49%\n", + " education: array - 41.97%\n", + " educations_details: text - 42.08%\n", + " posts: array - 1.27%\n", + " activity: array - 32.95%\n", + " certifications: array - 8.35%\n", + " courses: array - 2.55%\n", + " languages: array - 9.19%\n", + " publications: array - 1.23%\n", + " patents: array - 0.13%\n", + " projects: array - 2.08%\n", + " honors_and_awards: array - 2.13%\n", + " recommendations: array - 3.61%\n", + " volunteer_experience: array - 4.12%\n", + " organizations: array - 1.78%\n", + " people_also_viewed: array - 33.36%\n", + " similar_profiles: array - 0.58%\n", + " bio_links: array - 2.96%\n" + ] + } + ], + "source": [ + "# Show all available field names\n", + "print(\"\\n=== All Available Fields ===\")\n", + "for name, info in LinkedInPeopleProfiles.FIELDS.items():\n", + " print(f\" {name}: {info['type']} - {info.get('fill_rate', 'N/A')}%\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 3: Get Dataset Metadata from API\n", + "\n", + "Fetch live metadata from the API to see current field schema." + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Fetching LinkedIn Profiles metadata from API...\n", + "\n", + "Dataset ID: gd_l1viktl72bvl7bjuj0\n", + "Total fields from API: 45\n", + "\n", + "=== Sample Fields ===\n", + " id:\n", + " type: text\n", + " active: True\n", + " description: A unique identifier for the person's LinkedIn profile\n", + " name:\n", + " type: text\n", + " active: True\n", + " description: Profile name\n", + " city:\n", + " type: text\n", + " active: True\n", + " description: Geographical location of the user\n", + " country_code:\n", + " type: text\n", + " active: True\n", + " description: Geographical location of the user\n", + " position:\n", + " type: text\n", + " active: True\n", + " description: The current job title or position of the profile\n", + " about:\n", + " type: text\n", + " active: True\n", + " description: A concise profile summary. In some cases, only a truncated version with \"…\" is displayed on the website, and this is the version we capture\n", + " posts:\n", + " type: array\n", + " active: True\n", + " description: Contains information related to the user's last LinkedIn posts. It typically includes the post title, created date, URL link to the post, etc.\n", + " groups:\n", + " type: array\n", + " active: False\n", + " description: The LinkedIn groups that the profile is a part of\n", + " current_company:\n", + " type: object\n", + " active: True\n", + " description: Provides information about the user's current professional position. It typically includes the company name, the user's job title, the company ID, and the industry or sector to which the company belongs\n", + " experience:\n", + " type: array\n", + " active: True\n", + " description: Contains information about user's professional history. 
It typically includes the user's job title, length of time the user held the position, the geographic location of the company, the start and end date, the company name, URL link to the company profile, etc.\n" + ] + } + ], + "source": [ + "print(\"Fetching LinkedIn Profiles metadata from API...\\n\")\n", + "\n", + "async with client:\n", + " metadata = await client.datasets.linkedin_profiles.get_metadata()\n", + "\n", + "print(f\"Dataset ID: {metadata.id}\")\n", + "print(f\"Total fields from API: {len(metadata.fields)}\")\n", + "\n", + "print(\"\\n=== Sample Fields ===\")\n", + "for i, (name, field) in enumerate(list(metadata.fields.items())[:10]):\n", + " print(f\" {name}:\")\n", + " print(f\" type: {field.type}\")\n", + " print(f\" active: {field.active}\")\n", + " print(f\" description: {field.description or 'N/A'}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 4: Filter Dataset (Simple Filter)\n", + "\n", + "Filter profiles by a single criterion. Returns a snapshot_id for later download." + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Filter: {'name': 'followers', 'operator': '>', 'value': 10000}\n", + "Records limit: 2\n", + "\n", + "Snapshot created: snap_mlev60jlf03ta3ev\n", + "\n", + "Note: filter() returns immediately with a snapshot_id.\n", + "The snapshot is built asynchronously - use get_status() or download() next.\n" + ] + } + ], + "source": [ + "# Simple filter: profiles with 10,000+ followers\n", + "FILTER = {\n", + " \"name\": \"followers\",\n", + " \"operator\": \">\",\n", + " \"value\": 10000\n", + "}\n", + "LIMIT = 2 # Only get 2 records for demo\n", + "\n", + "print(f\"Filter: {FILTER}\")\n", + "print(f\"Records limit: {LIMIT}\\n\")\n", + "\n", + "async with client:\n", + " snapshot_id = await client.datasets.linkedin_profiles.filter(\n", + " filter=FILTER,\n", + " records_limit=LIMIT\n", + " )\n", + "\n", + "print(f\"Snapshot created: {snapshot_id}\")\n", + "print(\"\\nNote: filter() returns immediately with a snapshot_id.\")\n", + "print(\"The snapshot is built asynchronously - use get_status() or download() next.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 5: Check Snapshot Status\n", + "\n", + "Check the status of a snapshot before downloading." 
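+    ,
+    "\n",
+    "The runs in these notebooks show `scheduled` (still building) and `ready` states; `download()` performs this polling for you. A manual polling loop, sketched with the same calls as the cell below:\n",
+    "\n",
+    "```python\n",
+    "import asyncio\n",
+    "\n",
+    "async with client:\n",
+    "    while True:\n",
+    "        status = await client.datasets.linkedin_profiles.get_status(snapshot_id)\n",
+    "        if status.status == \"ready\" or status.error:\n",
+    "            break\n",
+    "        await asyncio.sleep(5)  # same cadence as poll_interval=5 used earlier\n",
+    "```"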
+ ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Checking status for snapshot: snap_mlev60jlf03ta3ev\n", + "\n", + "=== Snapshot Status ===\n", + "ID: snap_mlev60jlf03ta3ev\n", + "Status: ready\n", + "Dataset ID: gd_l1viktl72bvl7bjuj0\n", + "Records: 2\n", + "File size: 21733 bytes\n", + "Cost: $0\n" + ] + } + ], + "source": [ + "print(f\"Checking status for snapshot: {snapshot_id}\\n\")\n", + "\n", + "async with client:\n", + " status = await client.datasets.linkedin_profiles.get_status(snapshot_id)\n", + "\n", + "print(f\"=== Snapshot Status ===\")\n", + "print(f\"ID: {status.id}\")\n", + "print(f\"Status: {status.status}\")\n", + "print(f\"Dataset ID: {status.dataset_id}\")\n", + "print(f\"Records: {status.dataset_size}\")\n", + "print(f\"File size: {status.file_size} bytes\")\n", + "print(f\"Cost: ${status.cost}\")\n", + "\n", + "if status.error:\n", + " print(f\"Error: {status.error}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 6: Download Snapshot Data\n", + "\n", + "Download the filtered data. This polls until ready, then returns the records." + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "snapshot_id=\"snap_mlev60jlf03ta3ev\"" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Downloading snapshot: snap_mlev60jlf03ta3ev\n", + "(This will poll until ready...)\n", + "\n", + "Downloaded 2 profiles\n", + "\n", + "=== Profile 1 ===\n", + " Name: Jacques Wakefield\n", + " Position: Affiliate Marketer\n", + " City: Jackson, Tennessee, United States\n", + " Country: US\n", + " Followers: 15700\n", + " Connections: 500\n", + " URL: https://linkedin.com/in/jacqueswakefield\n", + "\n", + "=== Profile 2 ===\n", + " Name: Ajay Anand\n", + " Position: Ajay Anand, EY Global Vice Chair, Global Delivery Services |Innovator | Technologist | Board Advisor\n", + " City: San Francisco Bay Area\n", + " Country: US\n", + " Followers: 10649\n", + " Connections: 500\n", + " URL: https://ae.linkedin.com/in/ajay-anand-1912512\n", + "\n" + ] + } + ], + "source": [ + "print(f\"Downloading snapshot: {snapshot_id}\")\n", + "print(\"(This will poll until ready...)\\n\")\n", + "\n", + "async with client:\n", + " data = await client.datasets.linkedin_profiles.download(\n", + " snapshot_id,\n", + " format=\"jsonl\",\n", + " timeout=300, # 5 minutes\n", + " poll_interval=5 # Check every 5 seconds\n", + " )\n", + "\n", + "print(f\"Downloaded {len(data)} profiles\\n\")\n", + "\n", + "# Display first few profiles\n", + "for i, profile in enumerate(data[:3]):\n", + " print(f\"=== Profile {i+1} ===\")\n", + " print(f\" Name: {profile.get('name', 'N/A')}\")\n", + " print(f\" Position: {profile.get('position', 'N/A')}\")\n", + " print(f\" City: {profile.get('city', 'N/A')}\")\n", + " print(f\" Country: {profile.get('country_code', 'N/A')}\")\n", + " print(f\" Followers: {profile.get('followers', 'N/A')}\")\n", + " print(f\" Connections: {profile.get('connections', 'N/A')}\")\n", + " print(f\" URL: {profile.get('url', 'N/A')}\")\n", + " print()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 7: Combined Filter (AND/OR)\n", + "\n", + "Filter with multiple conditions using AND/OR operators." 
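+    ,
+    "\n",
+    "For set membership, the `in` operator (see the operator table in the summary) is more compact than an `or` group:\n",
+    "\n",
+    "```python\n",
+    "# Profiles from either the US or the UK\n",
+    "IN_FILTER = {\"name\": \"country_code\", \"operator\": \"in\", \"value\": [\"US\", \"UK\"]}\n",
+    "```"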
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": "# Step 1: Create filter\nCOMBINED_FILTER = {\n    \"operator\": \"and\",\n    \"filters\": [\n        {\"name\": \"country_code\", \"operator\": \"=\", \"value\": \"US\"},\n        {\"name\": \"followers\", \"operator\": \">\", \"value\": 5000}\n    ]\n}\n\nprint(\"Filter: US-based profiles with 5000+ followers\")\nprint(f\"Records limit: 5\\n\")\n\nasync with client:\n    snapshot_id = await client.datasets.linkedin_profiles.filter(\n        filter=COMBINED_FILTER,\n        records_limit=5\n    )\n\nprint(f\"Snapshot created: {snapshot_id}\")"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": "# Step 2: Download data\nprint(f\"Downloading snapshot: {snapshot_id}\\n\")\n\nasync with client:\n    data = await client.datasets.linkedin_profiles.download(snapshot_id)\n\nprint(f\"Downloaded {len(data)} profiles:\")\nfor profile in data:\n    print(f\"  - {profile.get('name', 'N/A')} ({profile.get('country_code', 'N/A')}) - {profile.get('followers', 0)} followers\")"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Test 8: LinkedIn Company Profiles\n",
+    "\n",
+    "Access the LinkedIn Company Profiles dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": "# Step 1: Create filter\nCOMPANY_FILTER = {\n    \"name\": \"company_size\",\n    \"operator\": \"=\",\n    \"value\": \"1001-5000 employees\"\n}\n\nprint(f\"Filter: {COMPANY_FILTER}\")\nprint(f\"Records limit: 5\\n\")\n\nasync with client:\n    snapshot_id = await client.datasets.linkedin_companies.filter(\n        filter=COMPANY_FILTER,\n        records_limit=5\n    )\n\nprint(f\"Snapshot created: {snapshot_id}\")"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": "# Step 2: Download data\nprint(f\"Downloading snapshot: {snapshot_id}\\n\")\n\nasync with client:\n    data = await client.datasets.linkedin_companies.download(snapshot_id)\n\nprint(f\"Downloaded {len(data)} companies:\")\nfor company in data:\n    print(f\"\\n=== {company.get('name', 'N/A')} ===\")\n    print(f\"  Industry: {company.get('industries', 'N/A')}\")\n    print(f\"  Size: {company.get('company_size', 'N/A')}\")\n    print(f\"  HQ: {company.get('headquarters', 'N/A')}\")\n    print(f\"  Website: {company.get('website', 'N/A')}\")\n    print(f\"  Followers: {company.get('followers', 'N/A')}\")"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "---\n",
+    "## Test 9: Export Results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": "from brightdata.datasets import export_json, export_csv, export\n\n# Export to JSON\njson_file = export_json(data, \"linkedin_results.json\")\nprint(f\"Exported to: {json_file}\")\n\n# Export to CSV\ncsv_file = export_csv(data, \"linkedin_results.csv\")\nprint(f\"Exported to: {csv_file}\")\n\n# Or use auto-detect based on extension\n# export(data, \"results.json\")\n# export(data, \"results.csv\")\n\nprint(f\"\\nRecords: {len(data)}\")"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "from pathlib import Path\n",
+    "\n",
+    "if data:\n",
+    "    output_file = Path.cwd() / \"linkedin_dataset_results.json\"\n",
+    "    \n",
+    "    with open(output_file, \"w\") as f:\n",
+    "        json.dump(data, f, indent=2, default=str)\n",
+    "    \n",
+    "    print(f\"Exported to: {output_file}\")\n",
+    
print(f\"Records: {len(data)}\")\n", + "else:\n", + " print(\"No data to export\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Summary\n", + "\n", + "### Datasets vs Web Scrapers\n", + "\n", + "| Feature | Datasets | Web Scrapers |\n", + "|---------|----------|-------------|\n", + "| Data source | Pre-collected database | Live scraping |\n", + "| Speed | Instant filtering | Real-time collection |\n", + "| Use case | Bulk data, analytics | Specific URLs, fresh data |\n", + "| Pricing | Per record filtered | Per request |\n", + "\n", + "### Available LinkedIn Datasets\n", + "\n", + "| Dataset | Records | Fields | Access |\n", + "|---------|---------|--------|--------|\n", + "| LinkedIn People Profiles | 620M+ | 42 | `client.datasets.linkedin_profiles` |\n", + "| LinkedIn Company Profiles | 58.5M+ | 36 | `client.datasets.linkedin_companies` |\n", + "\n", + "### Dataset Methods\n", + "\n", + "| Method | Description |\n", + "|--------|-------------|\n", + "| `get_metadata()` | Get field schema from API |\n", + "| `filter(filter, records_limit)` | Create filtered snapshot (returns snapshot_id) |\n", + "| `get_status(snapshot_id)` | Check snapshot status |\n", + "| `download(snapshot_id)` | Poll and download data |\n", + "\n", + "### Filter Operators\n", + "\n", + "| Operator | Description | Example |\n", + "|----------|-------------|---------|\n", + "| `=` | Equal to | `{\"name\": \"country_code\", \"operator\": \"=\", \"value\": \"US\"}` |\n", + "| `!=` | Not equal | `{\"name\": \"country_code\", \"operator\": \"!=\", \"value\": \"CN\"}` |\n", + "| `>`, `<`, `>=`, `<=` | Numeric comparison | `{\"name\": \"followers\", \"operator\": \">\", \"value\": 10000}` |\n", + "| `in` | Value in list | `{\"name\": \"country_code\", \"operator\": \"in\", \"value\": [\"US\", \"UK\"]}` |\n", + "| `includes` | Text contains | `{\"name\": \"position\", \"operator\": \"includes\", \"value\": \"Engineer\"}` |\n", + "| `is_null` | Field is null | `{\"name\": \"about\", \"operator\": \"is_null\"}` |\n", + "| `is_not_null` | Field is not null | `{\"name\": \"about\", \"operator\": \"is_not_null\"}` |\n", + "\n", + "### Combined Filters\n", + "\n", + "```python\n", + "# AND condition\n", + "{\n", + " \"operator\": \"and\",\n", + " \"filters\": [\n", + " {\"name\": \"country_code\", \"operator\": \"=\", \"value\": \"US\"},\n", + " {\"name\": \"followers\", \"operator\": \">\", \"value\": 5000}\n", + " ]\n", + "}\n", + "\n", + "# OR condition\n", + "{\n", + " \"operator\": \"or\",\n", + " \"filters\": [\n", + " {\"name\": \"country_code\", \"operator\": \"=\", \"value\": \"US\"},\n", + " {\"name\": \"country_code\", \"operator\": \"=\", \"value\": \"UK\"}\n", + " ]\n", + "}\n", + "```\n", + "\n", + "### Class Helper Methods\n", + "\n", + "| Method | Description |\n", + "|--------|-------------|\n", + "| `get_field_names()` | List all field names |\n", + "| `get_high_fill_rate_fields(min_rate)` | Fields with fill rate above threshold |" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.10" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/pyproject.toml b/pyproject.toml index 654a9dd..adc068b 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ 
-7,7 +7,7 @@ where = ["src"] [project] name = "brightdata-sdk" -version = "2.1.2" +version = "2.2.0" description = "Modern async-first Python SDK for Bright Data APIs" authors = [{name = "Bright Data", email = "support@brightdata.com"}] license = {text = "MIT"} diff --git a/requirements.txt b/requirements.txt index 314c9e8..ba3dedb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,3 @@ tldextract>=5.0.0 pydantic>=2.0.0 pydantic-settings>=2.0.0 click>=8.1.0 - diff --git a/src/brightdata/cli/README.md b/src/brightdata/cli/README.md index 989b2d6..b0fdce0 100644 --- a/src/brightdata/cli/README.md +++ b/src/brightdata/cli/README.md @@ -195,4 +195,3 @@ brightdata scrape --help brightdata scrape amazon --help brightdata search --help ``` - diff --git a/src/brightdata/cli/banner.py b/src/brightdata/cli/banner.py index af63dd5..05111bf 100644 --- a/src/brightdata/cli/banner.py +++ b/src/brightdata/cli/banner.py @@ -40,37 +40,37 @@ def get_banner() -> str: Formatted banner string with colors """ banner = """ - + \033[1;33m██████╗ ██████╗ ██╗ ██████╗ ██╗ ██╗████████╗\033[0m \033[1;33m██╔══██╗██╔══██╗██║██╔════╝ ██║ ██║╚══██╔══╝\033[0m \033[1;33m██████╔╝██████╔╝██║██║ ███╗███████║ ██║ \033[0m \033[1;33m██╔══██╗██╔══██╗██║██║ ██║██╔══██║ ██║ \033[0m \033[1;33m██████╔╝██║ ██║██║╚██████╔╝██║ ██║ ██║ \033[0m \033[1;33m╚═════╝ ╚═╝ ╚═╝╚═╝ ╚═════╝ ╚═╝ ╚═╝ ╚═╝ \033[0m - + \033[1;35m██████╗ █████╗ ████████╗ █████╗ \033[0m \033[1;35m██╔══██╗██╔══██╗╚══██╔══╝██╔══██╗\033[0m \033[1;35m██║ ██║███████║ ██║ ███████║\033[0m \033[1;35m██║ ██║██╔══██║ ██║ ██╔══██║\033[0m \033[1;35m██████╔╝██║ ██║ ██║ ██║ ██║\033[0m \033[1;35m╚═════╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝\033[0m - + \033[1;32m██████╗ ██╗ ██╗████████╗██╗ ██╗ ██████╗ ███╗ ██╗\033[0m \033[1;32m██╔══██╗╚██╗ ██╔╝╚══██╔══╝██║ ██║██╔═══██╗████╗ ██║\033[0m \033[1;32m██████╔╝ ╚████╔╝ ██║ ███████║██║ ██║██╔██╗ ██║\033[0m \033[1;32m██╔═══╝ ╚██╔╝ ██║ ██╔══██║██║ ██║██║╚██╗██║\033[0m \033[1;32m██║ ██║ ██║ ██║ ██║╚██████╔╝██║ ╚████║\033[0m \033[1;32m╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚═╝ ╚═════╝ ╚═╝ ╚═══╝\033[0m - + \033[1;37m███████╗██████╗ ██╗ ██╗\033[0m \033[1;37m██╔════╝██╔══██╗██║ ██╔╝\033[0m \033[1;37m███████╗██║ ██║█████╔╝ \033[0m \033[1;37m╚════██║██║ ██║██╔═██╗ \033[0m \033[1;37m███████║██████╔╝██║ ██╗\033[0m \033[1;37m╚══════╝╚═════╝ ╚═╝ ╚═╝\033[0m - + \033[1;93m🐍\033[0m - + """ return banner diff --git a/src/brightdata/client.py b/src/brightdata/client.py index 253ae5d..0f69649 100644 --- a/src/brightdata/client.py +++ b/src/brightdata/client.py @@ -27,6 +27,7 @@ from .api.scrape_service import ScrapeService from .api.search_service import SearchService from .api.crawler_service import CrawlerService +from .datasets import DatasetsClient from .models import ScrapeResult from .types import AccountInfo from .constants import ( @@ -131,6 +132,7 @@ def __init__( self._search_service: Optional[SearchService] = None self._crawler_service: Optional[CrawlerService] = None self._web_unlocker_service: Optional[WebUnlockerService] = None + self._datasets_client: Optional[DatasetsClient] = None self._zone_manager: Optional[ZoneManager] = None self._is_connected = False self._account_info: Optional[Dict[str, Any]] = None @@ -282,6 +284,35 @@ def crawler(self) -> CrawlerService: self._crawler_service = CrawlerService(self) return self._crawler_service + @property + def datasets(self) -> DatasetsClient: + """ + Access pre-collected datasets. 
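+
+        Datasets are pre-collected by Bright Data, so filtering queries an
+        existing database instead of scraping pages live.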
+ + Provides access to Bright Data's datasets with filtering capabilities: + - client.datasets.list() + - client.datasets.linkedin_profiles.get_metadata() + - client.datasets.linkedin_profiles.filter(...) + - client.datasets.linkedin_profiles.download(snapshot_id) + + Returns: + DatasetsClient instance for dataset operations + + Example: + >>> # List available datasets + >>> datasets = await client.datasets.list() + >>> + >>> # Filter LinkedIn profiles + >>> snapshot_id = await client.datasets.linkedin_profiles.filter( + ... filter={"name": "industry", "operator": "=", "value": "Technology"}, + ... records_limit=100 + ... ) + >>> data = await client.datasets.linkedin_profiles.download(snapshot_id) + """ + if self._datasets_client is None: + self._datasets_client = DatasetsClient(self.engine) + return self._datasets_client + async def test_connection(self) -> bool: """ Test API connection and token validity. diff --git a/src/brightdata/datasets/__init__.py b/src/brightdata/datasets/__init__.py new file mode 100644 index 0000000..f170e23 --- /dev/null +++ b/src/brightdata/datasets/__init__.py @@ -0,0 +1,52 @@ +""" +Bright Data Datasets API client. + +Access pre-collected datasets and filter records. +""" + +from .client import DatasetsClient +from .base import BaseDataset, DatasetError +from .models import DatasetInfo, DatasetField, DatasetMetadata, SnapshotStatus +from .utils import export, export_json, export_jsonl, export_csv + +# Platform-specific datasets +from .linkedin import LinkedInPeopleProfiles, LinkedInCompanyProfiles +from .amazon import AmazonProducts +from .crunchbase import CrunchbaseCompanies +from .imdb import IMDBMovies +from .nba import NBAPlayersStats +from .goodreads import GoodreadsBooks +from .world_population import WorldPopulation + +__all__ = [ + # Client + "DatasetsClient", + # Base + "BaseDataset", + "DatasetError", + # Models + "DatasetInfo", + "DatasetField", + "DatasetMetadata", + "SnapshotStatus", + # Utils + "export", + "export_json", + "export_jsonl", + "export_csv", + # LinkedIn + "LinkedInPeopleProfiles", + "LinkedInCompanyProfiles", + # Amazon + "AmazonProducts", + # Crunchbase + "CrunchbaseCompanies", + # IMDB + "IMDBMovies", + # NBA + "NBAPlayersStats", + # Goodreads + "GoodreadsBooks", + # World Population + "WorldPopulation", +] diff --git a/src/brightdata/datasets/amazon/__init__.py b/src/brightdata/datasets/amazon/__init__.py new file mode 100644 index 0000000..75ab6ed --- /dev/null +++ b/src/brightdata/datasets/amazon/__init__.py @@ -0,0 +1,5 @@ +"""Amazon datasets.""" + +from .products import AmazonProducts + +__all__ = ["AmazonProducts"] diff --git a/src/brightdata/datasets/amazon/products.py b/src/brightdata/datasets/amazon/products.py new file mode 100644 index 0000000..b2680d7 --- /dev/null +++ b/src/brightdata/datasets/amazon/products.py @@ -0,0 +1,412 @@ +""" +Amazon Products dataset. + +Dataset ID: gd_l7q7dkf244hwjntr0 + +See FIELDS dict for all filterable fields with descriptions. +""" + +from typing import TYPE_CHECKING, Dict, Any + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class AmazonProducts(BaseDataset): + """ + Amazon Products dataset. + + Access Amazon product records with filtering. + + Example: + >>> products = client.datasets.amazon_products + >>> metadata = await products.get_metadata() + >>> snapshot_id = await products.filter( + ... filter={"name": "rating", "operator": ">", "value": 4.5}, + ... records_limit=100 + ... 
) + >>> data = await products.download(snapshot_id) + """ + + DATASET_ID = "gd_l7q7dkf244hwjntr0" + NAME = "amazon_products" + + # All available fields with metadata + # Format: field_name -> {"type": str, "description": str} + FIELDS: Dict[str, Dict[str, Any]] = { + # Core product identification + "title": { + "type": "text", + "description": "Product title/name", + }, + "asin": { + "type": "text", + "description": "Amazon Standard Identification Number", + }, + "parent_asin": { + "type": "text", + "description": "Parent ASIN for product variations", + }, + "input_asin": { + "type": "text", + "description": "Original input ASIN used for scraping", + }, + "url": { + "type": "url", + "description": "Full product page URL", + }, + "origin_url": { + "type": "url", + "description": "Original source URL", + }, + "domain": { + "type": "text", + "description": "Amazon domain (e.g., amazon.com)", + }, + # Brand & seller + "brand": { + "type": "text", + "description": "Product brand name", + }, + "seller_name": { + "type": "text", + "description": "Name of the seller", + }, + "seller_id": { + "type": "text", + "description": "Unique seller identifier", + }, + "seller_url": { + "type": "url", + "description": "URL to seller's storefront", + }, + "manufacturer": { + "type": "text", + "description": "Product manufacturer", + }, + "buybox_seller": { + "type": "text", + "description": "Current Buy Box winner seller", + }, + "number_of_sellers": { + "type": "number", + "description": "Number of sellers offering this product", + }, + "buybox_seller_rating": { + "type": "number", + "description": "Buy Box seller's rating", + }, + # Pricing + "initial_price": { + "type": "number", + "description": "Original/list price", + }, + "final_price": { + "type": "number", + "description": "Current selling price", + }, + "final_price_high": { + "type": "number", + "description": "High end of price range (for variations)", + }, + "currency": { + "type": "text", + "description": "Price currency code (e.g., USD)", + }, + "discount": { + "type": "text", + "description": "Discount percentage or amount", + }, + "buybox_prices": { + "type": "object", + "description": "Buy Box pricing details", + }, + "prices_breakdown": { + "type": "object", + "description": "Detailed price breakdown (list, deal, typical)", + }, + "other_sellers_prices": { + "type": "array", + "description": "Prices from other sellers", + }, + "coupon": { + "type": "text", + "description": "Available coupon code", + }, + "coupon_description": { + "type": "text", + "description": "Description of coupon discount", + }, + "inactive_buy_box": { + "type": "object", + "description": "Inactive Buy Box information", + }, + # Availability & shipping + "availability": { + "type": "text", + "description": "Stock availability status", + }, + "is_available": { + "type": "boolean", + "description": "Whether product is currently available", + }, + "max_quantity_available": { + "type": "number", + "description": "Maximum quantity available for purchase", + }, + "delivery": { + "type": "array", + "description": "Delivery options and dates", + }, + "ships_from": { + "type": "text", + "description": "Shipping origin location", + }, + "zipcode": { + "type": "text", + "description": "Delivery zipcode context", + }, + "city": { + "type": "text", + "description": "Delivery city context", + }, + "return_policy": { + "type": "text", + "description": "Return policy description", + }, + # Ratings & reviews + "rating": { + "type": "number", + "description": "Average star rating 
(0-5)", + }, + "reviews_count": { + "type": "number", + "description": "Total number of customer reviews", + }, + "answered_questions": { + "type": "number", + "description": "Number of answered Q&A", + }, + "top_review": { + "type": "text", + "description": "Featured/top customer review", + }, + "customer_says": { + "type": "text", + "description": "AI-generated customer sentiment summary", + }, + "customers_say": { + "type": "object", + "description": "Detailed customer feedback analysis", + }, + # Categories & rankings + "categories": { + "type": "array", + "description": "Product category hierarchy", + }, + "root_bs_category": { + "type": "text", + "description": "Root best seller category", + }, + "bs_category": { + "type": "text", + "description": "Best seller subcategory", + }, + "root_bs_rank": { + "type": "number", + "description": "Best seller rank in root category", + }, + "bs_rank": { + "type": "number", + "description": "Best seller rank in subcategory", + }, + "subcategory_rank": { + "type": "array", + "description": "Rankings in subcategories", + }, + "department": { + "type": "text", + "description": "Product department", + }, + # Badges & features + "badge": { + "type": "text", + "description": "Product badge (e.g., Best Seller)", + }, + "all_badges": { + "type": "array", + "description": "All product badges", + }, + "amazon_choice": { + "type": "boolean", + "description": "Whether product is Amazon's Choice", + }, + "amazon_prime": { + "type": "boolean", + "description": "Whether eligible for Prime", + }, + "premium_brand": { + "type": "boolean", + "description": "Whether a premium brand", + }, + "climate_pledge_friendly": { + "type": "boolean", + "description": "Climate Pledge Friendly certification", + }, + "sustainability_features": { + "type": "array", + "description": "Sustainability certifications and features", + }, + "sponsored": { + "type": "boolean", + "description": "Whether product listing is sponsored", + }, + # Product details + "description": { + "type": "text", + "description": "Short product description", + }, + "product_description": { + "type": "text", + "description": "Full product description", + }, + "features": { + "type": "array", + "description": "Product feature bullet points", + }, + "product_details": { + "type": "array", + "description": "Technical product specifications", + }, + "product_dimensions": { + "type": "text", + "description": "Product size dimensions", + }, + "item_weight": { + "type": "text", + "description": "Product weight", + }, + "model_number": { + "type": "text", + "description": "Manufacturer model number", + }, + "upc": { + "type": "text", + "description": "Universal Product Code", + }, + "ISBN10": { + "type": "text", + "description": "ISBN-10 for books", + }, + "ingredients": { + "type": "text", + "description": "Product ingredients (for applicable items)", + }, + "country_of_origin": { + "type": "text", + "description": "Country where product is made", + }, + "date_first_available": { + "type": "text", + "description": "Date product was first listed", + }, + "format": { + "type": "text", + "description": "Product format (for media items)", + }, + "language": { + "type": "text", + "description": "Product language", + }, + # Images & media + "image": { + "type": "url", + "description": "Main product image URL", + }, + "image_url": { + "type": "url", + "description": "Primary image URL", + }, + "images": { + "type": "array", + "description": "All product image URLs", + }, + "images_count": { + "type": "number", + 
"description": "Number of product images", + }, + "video": { + "type": "url", + "description": "Product video URL", + }, + "videos": { + "type": "array", + "description": "All product video URLs", + }, + "video_count": { + "type": "number", + "description": "Number of product videos", + }, + "downloadable_videos": { + "type": "array", + "description": "Downloadable video URLs", + }, + # Variations + "variations": { + "type": "array", + "description": "Product variations (size, color, etc.)", + }, + "variations_values": { + "type": "array", + "description": "Available variation options", + }, + # Enhanced content + "plus_content": { + "type": "boolean", + "description": "Whether has A+ Content", + }, + "from_the_brand": { + "type": "array", + "description": "Brand story/content section", + }, + "editorial_reviews": { + "type": "array", + "description": "Editorial review content", + }, + "about_the_author": { + "type": "text", + "description": "Author bio (for books)", + }, + # Store & purchase info + "store_url": { + "type": "url", + "description": "Brand store URL", + }, + "bought_past_month": { + "type": "number", + "description": "Units sold in past month", + }, + } + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + + @classmethod + def get_field_names(cls) -> list: + """Get list of all field names.""" + return list(cls.FIELDS.keys()) + + @classmethod + def get_fields_by_type(cls, field_type: str) -> list: + """Get fields of a specific type (text, number, array, object, url, boolean).""" + return [name for name, info in cls.FIELDS.items() if info.get("type") == field_type] + + @classmethod + def get_pricing_fields(cls) -> list: + """Get all pricing-related fields.""" + pricing_keywords = ["price", "cost", "discount", "coupon"] + return [ + name for name in cls.FIELDS.keys() if any(kw in name.lower() for kw in pricing_keywords) + ] diff --git a/src/brightdata/datasets/base.py b/src/brightdata/datasets/base.py new file mode 100644 index 0000000..62e008f --- /dev/null +++ b/src/brightdata/datasets/base.py @@ -0,0 +1,221 @@ +""" +Base dataset class - provides common functionality for all datasets. +""" + +import asyncio +import time +from typing import Dict, List, Any, Optional, Literal, TYPE_CHECKING + +from .models import DatasetMetadata, SnapshotStatus + +if TYPE_CHECKING: + from ..core.async_engine import AsyncEngine + + +class DatasetError(Exception): + """Error related to dataset operations.""" + + pass + + +class BaseDataset: + """ + Base class for all dataset types. + + Provides common methods: get_metadata(), filter(), get_status(), download(). + Subclasses set their own DATASET_ID and can add dataset-specific helpers. + """ + + BASE_URL = "https://api.brightdata.com" + DATASET_ID: str = "" # Override in subclasses + NAME: str = "" # Override in subclasses + + def __init__(self, engine: "AsyncEngine"): + self._engine = engine + self._metadata: Optional[DatasetMetadata] = None + + @property + def dataset_id(self) -> str: + return self.DATASET_ID + + @property + def name(self) -> str: + return self.NAME + + async def get_metadata(self) -> DatasetMetadata: + """ + Get dataset field schema. + + Returns field names, types, and descriptions for this dataset. + Use this to discover what fields you can filter by. 
+ + Returns: + DatasetMetadata with fields dict + """ + if self._metadata is None: + async with self._engine.get_from_url( + f"{self.BASE_URL}/datasets/{self.DATASET_ID}/metadata" + ) as response: + data = await response.json() + self._metadata = DatasetMetadata.from_dict(data) + return self._metadata + + async def filter( + self, + filter: Dict[str, Any], + records_limit: Optional[int] = None, + ) -> str: + """ + Filter dataset records and create a snapshot. + + Returns snapshot_id immediately - does NOT wait for results. + Use download() to poll and get the data. + + Args: + filter: Filter criteria. Example: + {"name": "industry", "operator": "=", "value": "Technology"} + Or with AND/OR: + { + "operator": "and", + "filters": [ + {"name": "industry", "operator": "=", "value": "Technology"}, + {"name": "followers", "operator": ">", "value": 10000} + ] + } + records_limit: Maximum number of records to return + + Returns: + snapshot_id (str) - use with download() to get data + """ + payload: Dict[str, Any] = { + "dataset_id": self.DATASET_ID, + "filter": filter, + } + if records_limit is not None: + payload["records_limit"] = records_limit + + async with self._engine.post_to_url( + f"{self.BASE_URL}/datasets/filter", + json_data=payload, + ) as response: + data = await response.json() + return data["snapshot_id"] + + async def get_status(self, snapshot_id: str) -> SnapshotStatus: + """ + Check snapshot status. + + Args: + snapshot_id: Snapshot ID from filter() + + Returns: + SnapshotStatus with status field: "scheduled", "building", "ready", or "failed" + """ + async with self._engine.get_from_url( + f"{self.BASE_URL}/datasets/snapshots/{snapshot_id}" + ) as response: + data = await response.json() + return SnapshotStatus.from_dict(data) + + async def download( + self, + snapshot_id: str, + format: Literal["json", "jsonl", "csv"] = "jsonl", + timeout: int = 300, + poll_interval: int = 5, + ) -> List[Dict[str, Any]]: + """ + Download snapshot data. + + Polls until snapshot is ready, then downloads and returns data. 
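+
+        Example (sketch; assumes a snapshot_id returned by filter()):
+            >>> data = await client.datasets.linkedin_profiles.download(snapshot_id)
+            >>> print(f"Downloaded {len(data)} records")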
+
+        Args:
+            snapshot_id: Snapshot ID from filter()
+            format: Response format (json, jsonl, csv)
+            timeout: Max seconds to wait for snapshot to be ready
+            poll_interval: Seconds between status checks
+
+        Returns:
+            List of records (dicts)
+
+        Raises:
+            DatasetError: If snapshot fails
+            TimeoutError: If snapshot not ready within timeout
+        """
+        start_time = time.time()
+
+        # Poll until ready
+        while True:
+            status = await self.get_status(snapshot_id)
+
+            if status.status == "ready":
+                break
+            elif status.status == "failed":
+                raise DatasetError(f"Snapshot failed: {status.error}")
+            elif time.time() - start_time > timeout:
+                raise TimeoutError(
+                    f"Snapshot {snapshot_id} not ready after {timeout}s "
+                    f"(status: {status.status})"
+                )
+
+            await asyncio.sleep(poll_interval)
+
+        import csv
+        import io
+        import json
+
+        # Download data
+        async with self._engine.get_from_url(
+            f"{self.BASE_URL}/datasets/snapshots/{snapshot_id}/download",
+            params={"format": format},
+        ) as response:
+            # Check for HTTP errors
+            if response.status >= 400:
+                error_text = await response.text()
+                raise DatasetError(f"Download failed (HTTP {response.status}): {error_text}")
+
+            # Get raw text first
+            text = await response.text()
+
+            # Handle empty response
+            if not text or not text.strip():
+                return []
+
+            # Try to parse based on content type and format
+            content_type = response.headers.get("Content-Type", "")
+
+            # CSV: parse rows into dicts when CSV was requested or served
+            if format == "csv" or "text/csv" in content_type:
+                reader = csv.DictReader(io.StringIO(text))
+                return [dict(row) for row in reader]
+
+            # Try JSON first (most common)
+            if "application/json" in content_type or text.strip().startswith("["):
+                try:
+                    data = json.loads(text)
+                except json.JSONDecodeError:
+                    pass
+                else:
+                    # Successfully parsed as JSON
+                    if isinstance(data, list):
+                        return data
+                    elif isinstance(data, dict) and "data" in data:
+                        return data["data"]
+                    else:
+                        return [data] if data else []
+
+            # Try JSONL (newline-delimited JSON)
+            if "ndjson" in content_type or format == "jsonl" or "\n" in text.strip():
+                try:
+                    lines = [line.strip() for line in text.strip().split("\n") if line.strip()]
+                    if lines:
+                        data = [json.loads(line) for line in lines]
+                        return data
+                except json.JSONDecodeError:
+                    pass
+
+            # Last resort: try as single JSON object
+            try:
+                data = json.loads(text)
+                if isinstance(data, list):
+                    return data
+                elif isinstance(data, dict) and "data" in data:
+                    return data["data"]
+                else:
+                    return [data] if data else []
+            except json.JSONDecodeError:
+                # Return raw text as fallback
+                return [{"raw": text}]
diff --git a/src/brightdata/datasets/client.py b/src/brightdata/datasets/client.py
new file mode 100644
index 0000000..70081c2
--- /dev/null
+++ b/src/brightdata/datasets/client.py
@@ -0,0 +1,136 @@
+"""
+Datasets client - main entry point for datasets API.
+"""
+
+from typing import List, Optional, TYPE_CHECKING
+
+from .models import DatasetInfo
+from .linkedin import LinkedInPeopleProfiles, LinkedInCompanyProfiles
+from .amazon import AmazonProducts
+from .crunchbase import CrunchbaseCompanies
+from .imdb import IMDBMovies
+from .nba import NBAPlayersStats
+from .goodreads import GoodreadsBooks
+from .world_population import WorldPopulation
+
+if TYPE_CHECKING:
+    from ..core.async_engine import AsyncEngine
+
+
+class DatasetsClient:
+    """
+    Client for Bright Data Datasets API.
+
+    Access pre-collected datasets and filter records.
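+
+    Dataset accessors such as linkedin_profiles and amazon_products are
+    created lazily on first access and cached on the client instance.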
+ + Usage: + async with BrightDataClient() as client: + # List all datasets + datasets = await client.datasets.list() + + # Get metadata for a specific dataset + metadata = await client.datasets.linkedin_profiles.get_metadata() + + # Filter records + snapshot_id = await client.datasets.linkedin_profiles.filter( + filter={"name": "industry", "operator": "=", "value": "Technology"}, + records_limit=100 + ) + + # Download results + data = await client.datasets.linkedin_profiles.download(snapshot_id) + """ + + BASE_URL = "https://api.brightdata.com" + + def __init__(self, engine: "AsyncEngine"): + self._engine = engine + + # Lazy-loaded dataset instances + self._linkedin_profiles: Optional[LinkedInPeopleProfiles] = None + self._linkedin_companies: Optional[LinkedInCompanyProfiles] = None + self._amazon_products: Optional[AmazonProducts] = None + self._crunchbase_companies: Optional[CrunchbaseCompanies] = None + self._imdb_movies: Optional[IMDBMovies] = None + self._nba_players_stats: Optional[NBAPlayersStats] = None + self._goodreads_books: Optional[GoodreadsBooks] = None + self._world_population: Optional[WorldPopulation] = None + + async def list(self) -> List[DatasetInfo]: + """ + List all available datasets. + + Returns: + List of DatasetInfo with id, name, and size + """ + async with self._engine.get_from_url(f"{self.BASE_URL}/datasets/list") as response: + data = await response.json() + + datasets = [] + for item in data: + datasets.append( + DatasetInfo( + id=item.get("id", ""), + name=item.get("name", ""), + size=item.get("size", 0), + ) + ) + return datasets + + # Dataset properties for IDE autocomplete + + @property + def linkedin_profiles(self) -> LinkedInPeopleProfiles: + """LinkedIn People Profiles dataset (620M+ records).""" + if self._linkedin_profiles is None: + self._linkedin_profiles = LinkedInPeopleProfiles(self._engine) + return self._linkedin_profiles + + @property + def linkedin_companies(self) -> LinkedInCompanyProfiles: + """LinkedIn Company Profiles dataset.""" + if self._linkedin_companies is None: + self._linkedin_companies = LinkedInCompanyProfiles(self._engine) + return self._linkedin_companies + + @property + def amazon_products(self) -> AmazonProducts: + """Amazon Products dataset.""" + if self._amazon_products is None: + self._amazon_products = AmazonProducts(self._engine) + return self._amazon_products + + @property + def crunchbase_companies(self) -> CrunchbaseCompanies: + """Crunchbase Companies dataset (2.3M+ records).""" + if self._crunchbase_companies is None: + self._crunchbase_companies = CrunchbaseCompanies(self._engine) + return self._crunchbase_companies + + @property + def imdb_movies(self) -> IMDBMovies: + """IMDB Movies dataset (867K+ records).""" + if self._imdb_movies is None: + self._imdb_movies = IMDBMovies(self._engine) + return self._imdb_movies + + @property + def nba_players_stats(self) -> NBAPlayersStats: + """NBA Players Stats dataset (17K+ records).""" + if self._nba_players_stats is None: + self._nba_players_stats = NBAPlayersStats(self._engine) + return self._nba_players_stats + + @property + def goodreads_books(self) -> GoodreadsBooks: + """Goodreads Books dataset.""" + if self._goodreads_books is None: + self._goodreads_books = GoodreadsBooks(self._engine) + return self._goodreads_books + + @property + def world_population(self) -> WorldPopulation: + """World Population dataset.""" + if self._world_population is None: + self._world_population = WorldPopulation(self._engine) + return self._world_population diff --git 
a/src/brightdata/datasets/crunchbase/__init__.py b/src/brightdata/datasets/crunchbase/__init__.py new file mode 100644 index 0000000..5d1fdcc --- /dev/null +++ b/src/brightdata/datasets/crunchbase/__init__.py @@ -0,0 +1,5 @@ +"""Crunchbase datasets.""" + +from .companies import CrunchbaseCompanies + +__all__ = ["CrunchbaseCompanies"] diff --git a/src/brightdata/datasets/crunchbase/companies.py b/src/brightdata/datasets/crunchbase/companies.py new file mode 100644 index 0000000..18e8051 --- /dev/null +++ b/src/brightdata/datasets/crunchbase/companies.py @@ -0,0 +1,602 @@ +""" +Crunchbase Companies dataset. + +Dataset ID: gd_l1vijqt9jfj7olije +Records: 2.3M+ companies + +See FIELDS dict for all filterable fields with descriptions and fill rates. +""" + +from typing import TYPE_CHECKING, Dict, Any + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class CrunchbaseCompanies(BaseDataset): + """ + Crunchbase Companies dataset. + + Access 2.3M+ Crunchbase company records with filtering. + + Example: + >>> companies = client.datasets.crunchbase_companies + >>> metadata = await companies.get_metadata() + >>> snapshot_id = await companies.filter( + ... filter={"name": "num_employees", "operator": ">", "value": 100}, + ... records_limit=100 + ... ) + >>> data = await companies.download(snapshot_id) + """ + + DATASET_ID = "gd_l1vijqt9jfj7olije" + NAME = "crunchbase_companies" + + # All available fields with metadata + # Format: field_name -> {"type": str, "description": str, "fill_rate": float} + FIELDS: Dict[str, Dict[str, Any]] = { + # Core identification + "name": { + "type": "text", + "description": "The name of the company", + "fill_rate": 100.00, + }, + "url": { + "type": "url", + "description": "The URL or web address associated with the company", + "fill_rate": 100.00, + }, + "id": { + "type": "text", + "description": "A unique identifier for each company in Crunchbase", + "fill_rate": 100.00, + }, + "uuid": { + "type": "text", + "description": "Universally unique identifier for the company", + "fill_rate": 100.00, + }, + "company_id": { + "type": "text", + "description": "A unique identifier for each company in Crunchbase", + "fill_rate": 99.07, + }, + "type": { + "type": "text", + "description": "Type of data entry", + "fill_rate": 100.00, + }, + # Company info + "about": { + "type": "text", + "description": "Overview or description of the company", + "fill_rate": 100.00, + }, + "full_description": { + "type": "text", + "description": "Detailed description of the company", + "fill_rate": 100.00, + }, + "company_overview": { + "type": "text", + "description": "Overview or description of the company", + "fill_rate": 99.07, + }, + "legal_name": { + "type": "text", + "description": "Legal name of the company", + "fill_rate": 59.62, + }, + "cb_rank": { + "type": "number", + "description": "Crunchbase rank assigned to the company", + "fill_rate": 97.02, + }, + "image": { + "type": "url", + "description": "Image or logo associated with the company", + "fill_rate": 94.22, + }, + # Status & type + "operating_status": { + "type": "text", + "description": "The current operating status of the company", + "fill_rate": 100.00, + }, + "company_type": { + "type": "text", + "description": "The type of company (eg, private, public)", + "fill_rate": 96.41, + }, + "ipo_status": { + "type": "text", + "description": "Status of the company regarding Initial Public Offering (IPO)", + "fill_rate": 99.94, + }, + "investor_type": { + "type": "text", + 
"description": "Type of investor", + "fill_rate": 0.00, + }, + # Location + "region": { + "type": "text", + "description": "The continent where the company's headquarters is located", + "fill_rate": 93.28, + }, + "country_code": { + "type": "text", + "description": "The country code where the company is located", + "fill_rate": 93.50, + }, + "hq_continent": { + "type": "text", + "description": "The continent where the company's headquarters is located", + "fill_rate": 92.59, + }, + "address": { + "type": "text", + "description": "Physical address of the company", + "fill_rate": 93.50, + }, + "location": { + "type": "array", + "description": "Location information for the company", + "fill_rate": 93.50, + "nested_fields": 2, + }, + "headquarters_regions": { + "type": "array", + "description": "Regions where the company has headquarters", + "fill_rate": 91.62, + "nested_fields": 2, + }, + # Industries & products + "industries": { + "type": "array", + "description": "Industries associated with the company", + "fill_rate": 94.51, + "nested_fields": 2, + }, + "total_active_products": { + "type": "number", + "description": "Total number of active products", + "fill_rate": 14.54, + }, + "siftery_products": { + "type": "array", + "description": "Products listed by Siftery", + "fill_rate": 14.45, + "nested_fields": 3, + }, + # Employees & contacts + "num_employees": { + "type": "text", + "description": "The number of employees in the company", + "fill_rate": 86.28, + }, + "num_employee_profiles": { + "type": "number", + "description": "Number of employee profiles associated with the company", + "fill_rate": 99.94, + }, + "number_of_employee_profiles": { + "type": "number", + "description": "Number of employee profiles associated with the company", + "fill_rate": 99.07, + }, + "num_contacts": { + "type": "number", + "description": "Total number of contacts associated with the company", + "fill_rate": 34.63, + }, + "number_of_contacts": { + "type": "number", + "description": "Total number of contacts associated with the company", + "fill_rate": 34.10, + }, + "num_contacts_linkedin": { + "type": "number", + "description": "Number of LinkedIn contacts", + "fill_rate": 34.64, + }, + "number_of_linkedin_contacts": { + "type": "number", + "description": "Number of LinkedIn contacts", + "fill_rate": 34.10, + }, + "contacts": { + "type": "array", + "description": "Contact information for the company", + "fill_rate": 46.38, + "nested_fields": 5, + }, + "current_employees": { + "type": "array", + "description": "Number of current employees", + "fill_rate": 25.29, + "nested_fields": 4, + }, + "num_alumni": { + "type": "number", + "description": "Total number of company alumni", + "fill_rate": 0.02, + }, + "alumni": { + "type": "array", + "description": "Information about company alumni", + "fill_rate": 0.61, + "nested_fields": 4, + }, + # Contact info + "website": { + "type": "text", + "description": "The official website of the company", + "fill_rate": 97.36, + }, + "contact_email": { + "type": "text", + "description": "Contact email address for the company", + "fill_rate": 74.28, + }, + "email_address": { + "type": "text", + "description": "Contact email address for the company", + "fill_rate": 73.56, + }, + "contact_phone": { + "type": "text", + "description": "Contact phone number for the company", + "fill_rate": 77.90, + }, + "phone_number": { + "type": "text", + "description": "Contact phone number for the company", + "fill_rate": 77.25, + }, + "social_media_links": { + "type": "array", + "description": 
"URLs of social media profiles associated with the company", + "fill_rate": 86.85, + }, + "socila_media_urls": { + "type": "array", + "description": "URLs of social media profiles associated with the company", + "fill_rate": 85.95, + }, + # Founding & dates + "founded_date": { + "type": "text", + "description": "The date when the company was founded", + "fill_rate": 2.42, + }, + # Funding & investments + "num_investors": { + "type": "number", + "description": "Number of investors in the company", + "fill_rate": 8.24, + }, + "investors": { + "type": "array", + "description": "List of investors in the company", + "fill_rate": 8.24, + "nested_fields": 6, + }, + "num_investments": { + "type": "number", + "description": "Total number of investments made by the company", + "fill_rate": 2.61, + }, + "investments": { + "type": "array", + "description": "Information about company investments", + "fill_rate": 2.61, + "nested_fields": 7, + }, + "num_investments_lead": { + "type": "number", + "description": "Number of investments led by the company", + "fill_rate": 1.40, + }, + "funding_rounds_list": { + "type": "array", + "description": "List of funding rounds", + "fill_rate": 10.06, + "nested_fields": 8, + }, + "funds_raised": { + "type": "array", + "description": "Total funds raised by the company", + "fill_rate": 2.61, + "nested_fields": 5, + }, + "num_funds": { + "type": "number", + "description": "Total number of funds", + "fill_rate": 0.31, + }, + "funds_list": { + "type": "array", + "description": "List of funds associated with the company", + "fill_rate": 0.31, + "nested_fields": 3, + }, + "num_diversity_spotlight_investments": { + "type": "number", + "description": "Number of diversity spotlight investments", + "fill_rate": 0.47, + }, + "diversity_investments": { + "type": "array", + "description": "Information about diversity investments", + "fill_rate": 0.47, + "nested_fields": 7, + }, + # Acquisitions & exits + "num_acquisitions": { + "type": "number", + "description": "Total number of acquisitions by the company", + "fill_rate": 1.88, + }, + "acquisitions": { + "type": "array", + "description": "Information about company acquisitions", + "fill_rate": 1.88, + "nested_fields": 4, + }, + "acquired_by": { + "type": "object", + "description": "Information about the acquiring entity", + "fill_rate": 4.57, + "nested_fields": 5, + }, + "num_exits": { + "type": "number", + "description": "Information about company exits", + "fill_rate": 0.06, + }, + "exits": { + "type": "array", + "description": "Information about company exits", + "fill_rate": 0.94, + "nested_fields": 4, + }, + # Organization structure + "num_sub_organizations": { + "type": "number", + "description": "Total number of sub-organizations", + "fill_rate": 0.53, + }, + "sub_organizations": { + "type": "array", + "description": "Sub-organizations associated with the company", + "fill_rate": 0.53, + "nested_fields": 4, + }, + "sub_organization_of": { + "type": "text", + "description": "Information about being a sub-organization of another entity", + "fill_rate": 0.80, + }, + # People + "founders": { + "type": "array", + "description": "Information about the founders of the company", + "fill_rate": 21.93, + "nested_fields": 3, + }, + "num_founder_alumni": { + "type": "number", + "description": "Total number of founder alumni", + "fill_rate": 0.01, + }, + "num_advisor_positions": { + "type": "number", + "description": "Number of advisory positions associated with the company", + "fill_rate": 3.51, + }, + "current_advisors": { + "type": 
"array", + "description": "List of current advisors for the company", + "fill_rate": 3.51, + "nested_fields": 4, + }, + "leadership_hire": { + "type": "array", + "description": "Leadership hiring information", + "fill_rate": 1.61, + "nested_fields": 4, + }, + "layoff": { + "type": "array", + "description": "Layoff information", + "fill_rate": 0.28, + "nested_fields": 4, + }, + "people_highlights": { + "type": "object", + "description": "Highlights of people associated with the company", + "fill_rate": 47.68, + "nested_fields": 3, + }, + # Technology + "active_tech_count": { + "type": "number", + "description": "Number of active technologies used by the company", + "fill_rate": 95.47, + }, + "builtwith_num_technologies_used": { + "type": "number", + "description": "Number of technologies the company is built with", + "fill_rate": 95.47, + }, + "built_with_num_technologies_used": { + "type": "number", + "description": "Number of technologies the company is built with", + "fill_rate": 94.61, + }, + "builtwith_tech": { + "type": "array", + "description": "Technologies used by the company", + "fill_rate": 93.77, + "nested_fields": 3, + }, + "built_with_tech": { + "type": "array", + "description": "Technologies used by the company", + "fill_rate": 92.91, + "nested_fields": 3, + }, + "technology_highlights": { + "type": "object", + "description": "Highlights of technologies used by the company", + "fill_rate": 96.06, + "nested_fields": 4, + }, + # Traffic & analytics + "monthly_visits": { + "type": "number", + "description": "Number of monthly website visits", + "fill_rate": 52.61, + }, + "monthly_visits_growth": { + "type": "number", + "description": "Growth in monthly visits", + "fill_rate": 44.34, + }, + "semrush_visits_latest_month": { + "type": "number", + "description": "Latest monthly visits data from SEMrush", + "fill_rate": 52.61, + }, + "semrush_visits_mom_pct": { + "type": "number", + "description": "Percentage growth in SEMrush visits", + "fill_rate": 44.34, + }, + "semrush_last_updated": { + "type": "text", + "description": "Last update date for SEMrush data", + "fill_rate": 52.61, + }, + "semrush_location_list": { + "type": "array", + "description": "List of locations according to SEMrush", + "fill_rate": 1.78, + "nested_fields": 5, + }, + # Third-party data + "bombora": { + "type": "array", + "description": "Bombora information", + "fill_rate": 21.27, + "nested_fields": 5, + }, + "bombora_last_updated": { + "type": "text", + "description": "Last update date for Bombora data", + "fill_rate": 24.26, + }, + "apptopia": { + "type": "array", + "description": "Apptopia data", + "fill_rate": 5.56, + "nested_fields": 4, + }, + "apptopia_total_downloads": { + "type": "number", + "description": "Total downloads according to Apptopia", + "fill_rate": 1.56, + }, + "apptopia_total_downloads_mom_pct": { + "type": "text", + "description": "Month-over-month percentage change in downloads", + "fill_rate": 1.35, + }, + "aberdeen_it_spend": { + "type": "object", + "description": "IT spending data from Aberdeen", + "fill_rate": 56.04, + "nested_fields": 3, + }, + "ipqwery": { + "type": "object", + "description": "IPQwery data", + "fill_rate": 8.42, + "nested_fields": 4, + }, + # Events & news + "num_event_appearances": { + "type": "number", + "description": "Number of appearances in events", + "fill_rate": 0.10, + }, + "event_appearances": { + "type": "array", + "description": "Number of times the company has appeared in events", + "fill_rate": 0.10, + "nested_fields": 5, + }, + "num_news": { + 
"type": "number", + "description": "Number of news articles related to the company", + "fill_rate": 0.53, + }, + "news": { + "type": "array", + "description": "News related to the company", + "fill_rate": 27.91, + "nested_fields": 6, + }, + # Lists & features + "featured_list": { + "type": "array", + "description": "Indicates if the company is featured on a list", + "fill_rate": 95.71, + "nested_fields": 4, + }, + "similar_companies": { + "type": "array", + "description": "List of companies similar to the specified company", + "fill_rate": 57.21, + "nested_fields": 2, + }, + # Financial highlights + "financials_highlights": { + "type": "object", + "description": "Highlights of financial data", + "fill_rate": 10.87, + "nested_fields": 4, + }, + "ipo_fields": { + "type": "object", + "description": "Information related to Initial Public Offering (IPO)", + "fill_rate": 1.49, + "nested_fields": 5, + }, + "stock_symbol": { + "type": "text", + "description": "Stock symbol associated with the company", + "fill_rate": 0.42, + }, + } + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + + @classmethod + def get_field_names(cls) -> list: + """Get list of all field names.""" + return list(cls.FIELDS.keys()) + + @classmethod + def get_high_fill_rate_fields(cls, min_rate: float = 50.0) -> list: + """Get fields with fill rate above threshold.""" + return [name for name, info in cls.FIELDS.items() if info.get("fill_rate", 0) >= min_rate] + + @classmethod + def get_fields_by_type(cls, field_type: str) -> list: + """Get fields of a specific type (text, number, array, object, url).""" + return [name for name, info in cls.FIELDS.items() if info.get("type") == field_type] diff --git a/src/brightdata/datasets/goodreads/__init__.py b/src/brightdata/datasets/goodreads/__init__.py new file mode 100644 index 0000000..4567b77 --- /dev/null +++ b/src/brightdata/datasets/goodreads/__init__.py @@ -0,0 +1,5 @@ +"""Goodreads dataset.""" + +from .books import GoodreadsBooks + +__all__ = ["GoodreadsBooks"] diff --git a/src/brightdata/datasets/goodreads/books.py b/src/brightdata/datasets/goodreads/books.py new file mode 100644 index 0000000..3c43689 --- /dev/null +++ b/src/brightdata/datasets/goodreads/books.py @@ -0,0 +1,121 @@ +""" +Goodreads Books dataset. + +Dataset ID: gd_lreq6ho72fhvovjj7a + +See FIELDS dict for all filterable fields with descriptions. +""" + +from typing import TYPE_CHECKING, Dict, Any + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class GoodreadsBooks(BaseDataset): + """ + Goodreads Books dataset. + + Access Goodreads book records with filtering. + + Example: + >>> books = client.datasets.goodreads_books + >>> metadata = await books.get_metadata() + >>> snapshot_id = await books.filter( + ... filter={"name": "star_rating", "operator": ">", "value": 4.0}, + ... records_limit=100 + ... 
) + >>> data = await books.download(snapshot_id) + """ + + DATASET_ID = "gd_lreq6ho72fhvovjj7a" + NAME = "goodreads_books" + + # All available fields with metadata + FIELDS: Dict[str, Dict[str, Any]] = { + # Book identification + "id": { + "type": "text", + "description": "Goodreads book ID", + }, + "url": { + "type": "url", + "description": "Goodreads book page URL", + }, + "isbn": { + "type": "text", + "description": "ISBN number", + }, + # Book details + "name": { + "type": "text", + "description": "Book title", + }, + "author": { + "type": "array", + "description": "Author name(s)", + }, + "summary": { + "type": "text", + "description": "Book summary/description", + }, + "genres": { + "type": "array", + "description": "Book genres/categories", + }, + "first_published": { + "type": "text", + "description": "First publication date", + }, + # Ratings & reviews + "star_rating": { + "type": "number", + "description": "Average star rating (0-5)", + }, + "num_ratings": { + "type": "number", + "description": "Total number of ratings", + }, + "num_reviews": { + "type": "number", + "description": "Total number of reviews", + }, + "community_reviews": { + "type": "object", + "description": "Breakdown of reviews by star rating", + }, + # Author info + "about_author": { + "type": "object", + "description": "Author information (name, books, followers)", + }, + # Pricing + "kindle_price": { + "type": "text", + "description": "Kindle edition price", + }, + } + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + + @classmethod + def get_field_names(cls) -> list: + """Get list of all field names.""" + return list(cls.FIELDS.keys()) + + @classmethod + def get_fields_by_type(cls, field_type: str) -> list: + """Get fields of a specific type (text, number, array, object, url, boolean).""" + return [name for name, info in cls.FIELDS.items() if info.get("type") == field_type] + + @classmethod + def get_rating_fields(cls) -> list: + """Get all rating-related fields.""" + return [ + name + for name in cls.FIELDS.keys() + if "rating" in name.lower() or "review" in name.lower() + ] diff --git a/src/brightdata/datasets/imdb/__init__.py b/src/brightdata/datasets/imdb/__init__.py new file mode 100644 index 0000000..e0010a9 --- /dev/null +++ b/src/brightdata/datasets/imdb/__init__.py @@ -0,0 +1,5 @@ +"""IMDB dataset.""" + +from .movies import IMDBMovies + +__all__ = ["IMDBMovies"] diff --git a/src/brightdata/datasets/imdb/movies.py b/src/brightdata/datasets/imdb/movies.py new file mode 100644 index 0000000..a7face3 --- /dev/null +++ b/src/brightdata/datasets/imdb/movies.py @@ -0,0 +1,195 @@ +""" +IMDB Movies dataset. + +Dataset ID: gd_l1vikf2h1a4t6x8qzu + +See FIELDS dict for all filterable fields with descriptions. +""" + +from typing import TYPE_CHECKING, Dict, Any + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class IMDBMovies(BaseDataset): + """ + IMDB Movies dataset. + + Access IMDB movie/TV records with filtering. + + Example: + >>> movies = client.datasets.imdb_movies + >>> metadata = await movies.get_metadata() + >>> snapshot_id = await movies.filter( + ... filter={"name": "imdb_rating", "operator": ">", "value": 8.0}, + ... records_limit=100 + ... 
) + >>> data = await movies.download(snapshot_id) + """ + + DATASET_ID = "gd_l1vikf2h1a4t6x8qzu" + NAME = "imdb_movies" + + # All available fields with metadata + FIELDS: Dict[str, Dict[str, Any]] = { + # Core identification + "id": { + "type": "text", + "description": "IMDB title ID (e.g., tt5931912)", + }, + "title": { + "type": "text", + "description": "Movie/show title", + }, + "url": { + "type": "url", + "description": "IMDB page URL", + }, + "media_type": { + "type": "text", + "description": "Type of media (Feature Film, Documentary, etc.)", + }, + # Ratings & reviews + "imdb_rating": { + "type": "number", + "description": "IMDB rating (0-10)", + }, + "imdb_rating_count": { + "type": "number", + "description": "Number of IMDB ratings", + }, + "popularity": { + "type": "number", + "description": "Popularity score", + }, + "review_count": { + "type": "number", + "description": "Number of user reviews", + }, + "review_rating": { + "type": "number", + "description": "Average user review rating", + }, + "critics_review_count": { + "type": "number", + "description": "Number of critic reviews", + }, + "featured_review": { + "type": "text", + "description": "Featured user review text", + }, + # Content details + "genres": { + "type": "array", + "description": "List of genres (e.g., Drama, Comedy)", + }, + "presentation": { + "type": "text", + "description": "Short presentation/tagline", + }, + "storyline": { + "type": "text", + "description": "Plot summary/storyline", + }, + "comment": { + "type": "text", + "description": "Additional comments", + }, + # Cast & crew + "credit": { + "type": "array", + "description": "Credits (directors, writers, etc.)", + }, + "top_cast": { + "type": "array", + "description": "Top cast members with character names", + }, + # Release details + "details_release_date": { + "type": "text", + "description": "Release date", + }, + "details_countries_of_origin": { + "type": "text", + "description": "Countries of origin", + }, + "details_language": { + "type": "text", + "description": "Languages", + }, + "details_also_known_as": { + "type": "text", + "description": "Alternative titles", + }, + "details_filming_locations": { + "type": "text", + "description": "Filming locations", + }, + "details_production_companies": { + "type": "text", + "description": "Production companies", + }, + "details_official_site": { + "type": "url", + "description": "Official website URL", + }, + # Technical specs + "specs_color": { + "type": "text", + "description": "Color format (Color, Black and White)", + }, + "specs_sound_mix": { + "type": "text", + "description": "Sound mix format", + }, + "specs_aspect_ratio": { + "type": "text", + "description": "Aspect ratio", + }, + # Media + "poster_url": { + "type": "url", + "description": "Movie poster image URL", + }, + "videos": { + "type": "array", + "description": "Video links (trailers, clips)", + }, + "photos": { + "type": "array", + "description": "Photo gallery links", + }, + # Awards & box office + "awards": { + "type": "text", + "description": "Awards and nominations", + }, + "boxoffice_budget": { + "type": "text", + "description": "Production budget", + }, + } + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + + @classmethod + def get_field_names(cls) -> list: + """Get list of all field names.""" + return list(cls.FIELDS.keys()) + + @classmethod + def get_fields_by_type(cls, field_type: str) -> list: + """Get fields of a specific type (text, number, array, object, url, boolean).""" + return [name for name, 
info in cls.FIELDS.items() if info.get("type") == field_type] + + @classmethod + def get_rating_fields(cls) -> list: + """Get all rating-related fields.""" + rating_keywords = ["rating", "review", "score"] + return [ + name for name in cls.FIELDS.keys() if any(kw in name.lower() for kw in rating_keywords) + ] diff --git a/src/brightdata/datasets/linkedin/__init__.py b/src/brightdata/datasets/linkedin/__init__.py new file mode 100644 index 0000000..7b4eacf --- /dev/null +++ b/src/brightdata/datasets/linkedin/__init__.py @@ -0,0 +1,6 @@ +"""LinkedIn datasets.""" + +from .people_profiles import LinkedInPeopleProfiles +from .company_profiles import LinkedInCompanyProfiles + +__all__ = ["LinkedInPeopleProfiles", "LinkedInCompanyProfiles"] diff --git a/src/brightdata/datasets/linkedin/company_profiles.py b/src/brightdata/datasets/linkedin/company_profiles.py new file mode 100644 index 0000000..a4c4bf2 --- /dev/null +++ b/src/brightdata/datasets/linkedin/company_profiles.py @@ -0,0 +1,197 @@ +""" +LinkedIn Company Profiles dataset. + +Dataset ID: gd_l1vikfnt1wgvvqz95w +Records: 58.5M+ companies + +See FIELDS dict for all filterable fields with descriptions. +""" + +from typing import TYPE_CHECKING, Dict, Any + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class LinkedInCompanyProfiles(BaseDataset): + """ + LinkedIn Company Profiles dataset. + + Access 58.5M+ LinkedIn company records with filtering. + + Example: + >>> companies = client.datasets.linkedin_companies + >>> metadata = await companies.get_metadata() + >>> snapshot_id = await companies.filter( + ... filter={"name": "company_size", "operator": "=", "value": "1001-5000"}, + ... records_limit=100 + ... ) + >>> data = await companies.download(snapshot_id) + """ + + DATASET_ID = "gd_l1vikfnt1wgvvqz95w" + NAME = "linkedin_company_profiles" + + # All available fields with metadata + # Format: field_name -> {"type": str, "description": str} + FIELDS: Dict[str, Dict[str, Any]] = { + "id": { + "type": "text", + "description": "Unique identifier for the company profile (URL slug)", + }, + "name": { + "type": "text", + "description": "Company name", + }, + "country_code": { + "type": "text", + "description": "Two-letter country code (e.g., US, GB, FR)", + }, + "locations": { + "type": "array", + "description": "List of company office locations with addresses", + }, + "followers": { + "type": "number", + "description": "Number of LinkedIn followers", + }, + "employees_in_linkedin": { + "type": "number", + "description": "Number of employees with LinkedIn profiles", + }, + "about": { + "type": "text", + "description": "Company description/about section", + }, + "specialties": { + "type": "array", + "description": "List of company specialties/expertise areas", + }, + "company_size": { + "type": "text", + "description": "Employee count range (e.g., '1001-5000 employees')", + }, + "organization_type": { + "type": "text", + "description": "Type of organization (e.g., Public Company, Private)", + }, + "industries": { + "type": "text", + "description": "Primary industry classification", + }, + "website": { + "type": "url", + "description": "Company website URL", + }, + "crunchbase_url": { + "type": "url", + "description": "Link to Crunchbase profile if available", + }, + "founded": { + "type": "number", + "description": "Year the company was founded", + }, + "company_id": { + "type": "text", + "description": "LinkedIn numeric company ID", + }, + "employees": { + "type": "array", + "description": 
"List of employee profiles with basic info", + }, + "headquarters": { + "type": "text", + "description": "City/region of company headquarters", + }, + "image": { + "type": "url", + "description": "Company cover/banner image URL", + }, + "logo": { + "type": "url", + "description": "Company logo image URL", + }, + "similar": { + "type": "array", + "description": "Similar companies suggested by LinkedIn", + }, + "url": { + "type": "url", + "description": "Full LinkedIn company profile URL", + }, + "updates": { + "type": "array", + "description": "Recent company posts/updates", + }, + "slogan": { + "type": "text", + "description": "Company tagline or slogan", + }, + "affiliated": { + "type": "array", + "description": "Affiliated/subsidiary companies", + }, + "funding": { + "type": "object", + "description": "Funding information if available", + }, + "investors": { + "type": "array", + "description": "List of investors if available", + }, + "formatted_locations": { + "type": "array", + "description": "Formatted address strings for locations", + }, + "stock_info": { + "type": "object", + "description": "Stock ticker and exchange info for public companies", + }, + "get_directions_url": { + "type": "array", + "description": "Map/directions URLs for office locations", + }, + "description": { + "type": "text", + "description": "Brief company description with follower count", + }, + "additional_information": { + "type": "object", + "description": "Extra company details and metadata", + }, + "country_codes_array": { + "type": "array", + "description": "All country codes where company operates", + }, + "alumni": { + "type": "array", + "description": "Notable alumni from the company", + }, + "alumni_information": { + "type": "object", + "description": "Statistics about company alumni", + }, + "website_simplified": { + "type": "text", + "description": "Simplified/masked website domain", + }, + "unformatted_about": { + "type": "text", + "description": "Raw about text without formatting", + }, + } + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + + @classmethod + def get_field_names(cls) -> list: + """Get list of all field names.""" + return list(cls.FIELDS.keys()) + + @classmethod + def get_text_fields(cls) -> list: + """Get fields that are text type (commonly used for filtering).""" + return [name for name, info in cls.FIELDS.items() if info.get("type") == "text"] diff --git a/src/brightdata/datasets/linkedin/people_profiles.py b/src/brightdata/datasets/linkedin/people_profiles.py new file mode 100644 index 0000000..6c90bca --- /dev/null +++ b/src/brightdata/datasets/linkedin/people_profiles.py @@ -0,0 +1,285 @@ +""" +LinkedIn People Profiles dataset. + +Dataset ID: gd_l1viktl72bvl7bjuj0 +Records: 620M+ profiles + +See FIELDS dict for all filterable fields with descriptions and fill rates. +""" + +from typing import TYPE_CHECKING, Dict, Any + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class LinkedInPeopleProfiles(BaseDataset): + """ + LinkedIn People Profiles dataset. + + Access 620M+ LinkedIn profile records with filtering. + + Example: + >>> profiles = client.datasets.linkedin_profiles + >>> metadata = await profiles.get_metadata() + >>> snapshot_id = await profiles.filter( + ... filter={"name": "industry", "operator": "=", "value": "Technology"}, + ... records_limit=100 + ... 
)
+        >>> data = await profiles.download(snapshot_id)
+    """
+
+    DATASET_ID = "gd_l1viktl72bvl7bjuj0"
+    NAME = "linkedin_people_profiles"
+
+    # All available fields with metadata
+    # Format: field_name -> {"type": str, "description": str, "fill_rate": float}
+    FIELDS: Dict[str, Dict[str, Any]] = {
+        "id": {
+            "type": "text",
+            "description": "A unique identifier for the person's LinkedIn profile",
+            "fill_rate": 100.00,
+        },
+        "name": {
+            "type": "text",
+            "description": "Profile name",
+            "fill_rate": 97.54,
+        },
+        "first_name": {
+            "type": "text",
+            "description": "First name of the user",
+            "fill_rate": 95.10,
+        },
+        "last_name": {
+            "type": "text",
+            "description": "Last name of the user",
+            "fill_rate": 94.80,
+        },
+        "city": {
+            "type": "text",
+            "description": "City of the user",
+            "fill_rate": 96.30,
+        },
+        "country_code": {
+            "type": "text",
+            "description": "Two-letter country code of the user's location (e.g., US)",
+            "fill_rate": 97.11,
+        },
+        "location": {
+            "type": "text",
+            "description": "Full geographical location of the user",
+            "fill_rate": 61.93,
+        },
+        "position": {
+            "type": "text",
+            "description": "The current job title or position of the profile",
+            "fill_rate": 91.23,
+        },
+        "about": {
+            "type": "text",
+            "description": "A concise profile summary. May be truncated with '...'",
+            "fill_rate": 18.90,
+        },
+        "url": {
+            "type": "url",
+            "description": "URL that links directly to the LinkedIn profile",
+            "fill_rate": 100.00,
+        },
+        "input_url": {
+            "type": "url",
+            "description": "The URL that was entered when starting the scraping process",
+            "fill_rate": 100.00,
+        },
+        "linkedin_id": {
+            "type": "text",
+            "description": "LinkedIn profile identifier",
+            "fill_rate": 100.00,
+        },
+        "linkedin_num_id": {
+            "type": "text",
+            "description": "Numeric LinkedIn profile ID",
+            "fill_rate": 100.00,
+        },
+        "avatar": {
+            "type": "url",
+            "description": "URL that links to the profile picture of the LinkedIn user",
+            "fill_rate": 96.28,
+        },
+        "banner_image": {
+            "type": "url",
+            "description": "Banner image URL",
+            "fill_rate": 96.28,
+        },
+        "default_avatar": {
+            "type": "boolean",
+            "description": "Whether the avatar is the default placeholder image",
+            "fill_rate": 95.73,
+        },
+        "followers": {
+            "type": "number",
+            "description": "Number of users/companies following the profile",
+            "fill_rate": 71.39,
+        },
+        "connections": {
+            "type": "number",
+            "description": "Number of connections the profile has",
+            "fill_rate": 70.33,
+        },
+        "recommendations_count": {
+            "type": "number",
+            "description": "Total number of recommendations received",
+            "fill_rate": 3.65,
+        },
+        "influencer": {
+            "type": "boolean",
+            "description": "Whether the profile is marked as an influencer",
+            "fill_rate": 46.06,
+        },
+        "memorialized_account": {
+            "type": "boolean",
+            "description": "Whether the account is memorialized",
+            "fill_rate": 99.44,
+        },
+        # Current company fields
+        "current_company_name": {
+            "type": "text",
+            "description": "The name of the latest/current company of the profile",
+            "fill_rate": 69.60,
+        },
+        "current_company_company_id": {
+            "type": "text",
+            "description": "The ID of the latest/current company of the profile",
+            "fill_rate": 38.94,
+        },
+        "current_company": {
+            "type": "object",
+            "description": "Current professional position info: company name, job title, company ID, industry",
+            "fill_rate": 100.00,
+            "nested_fields": 6,
+        },
+        # Experience & Education
+        "experience": {
+            "type": "array",
+            "description": "Professional history: job titles, dates, companies, locations",
+            "fill_rate": 71.49,
+            "nested_fields": 16,
+        },
+        "education": {
+            "type": "array",
+            "description": "Educational background: degree, field, start/end year",
+            "fill_rate": 41.97,
+            "nested_fields": 10,
+        },
+        "educations_details": {
+            "type": "text",
+            "description": "Educational background as text",
+            "fill_rate": 42.08,
+        },
+        # Activity & Posts
+        "posts": {
+            "type": "array",
+            "description": "The user's most recent LinkedIn posts: title, date, URL",
+            "fill_rate": 1.27,
+            "nested_fields": 7,
+        },
+        "activity": {
+            "type": "array",
+            "description": "The user's activity related to posts",
+            "fill_rate": 32.95,
+            "nested_fields": 5,
+        },
+        # Professional credentials
+        "certifications": {
+            "type": "array",
+            "description": "Licenses & Certifications",
+            "fill_rate": 8.35,
+            "nested_fields": 5,
+        },
+        "courses": {
+            "type": "array",
+            "description": "Courses or educational programs undertaken",
+            "fill_rate": 2.55,
+            "nested_fields": 3,
+        },
+        "languages": {
+            "type": "array",
+            "description": "User's language proficiencies",
+            "fill_rate": 9.19,
+            "nested_fields": 2,
+        },
+        "publications": {
+            "type": "array",
+            "description": "Published works or presentations",
+            "fill_rate": 1.23,
+            "nested_fields": 4,
+        },
+        "patents": {
+            "type": "array",
+            "description": "Patents filed or granted",
+            "fill_rate": 0.13,
+            "nested_fields": 4,
+        },
+        "projects": {
+            "type": "array",
+            "description": "Professional or academic projects",
+            "fill_rate": 2.08,
+            "nested_fields": 4,
+        },
+        "honors_and_awards": {
+            "type": "array",
+            "description": "Awards and recognitions received",
+            "fill_rate": 2.13,
+            "nested_fields": 4,
+        },
+        # Social & Network
+        "recommendations": {
+            "type": "array",
+            "description": "Recommendations received from connections/colleagues",
+            "fill_rate": 3.61,
+        },
+        "volunteer_experience": {
+            "type": "array",
+            "description": "Information related to volunteer work",
+            "fill_rate": 4.12,
+            "nested_fields": 8,
+        },
+        "organizations": {
+            "type": "array",
+            "description": "Memberships in professional organizations",
+            "fill_rate": 1.78,
+            "nested_fields": 6,
+        },
+        "people_also_viewed": {
+            "type": "array",
+            "description": "Profiles that viewers of this profile also viewed",
+            "fill_rate": 33.36,
+            "nested_fields": 4,
+        },
+        "similar_profiles": {
+            "type": "array",
+            "description": "Profiles similar to the current one",
+            "fill_rate": 0.58,
+            "nested_fields": 4,
+        },
+        "bio_links": {
+            "type": "array",
+            "description": "External links added to the bio",
+            "fill_rate": 2.96,
+            "nested_fields": 2,
+        },
+    }
+
+    def __init__(self, engine: "AsyncEngine"):
+        super().__init__(engine)
+
+    @classmethod
+    def get_field_names(cls) -> list:
+        """Get list of all field names."""
+        return list(cls.FIELDS.keys())
+
+    @classmethod
+    def get_high_fill_rate_fields(cls, min_rate: float = 50.0) -> list:
+        """Get fields with fill rate above threshold."""
+        return [name for name, info in cls.FIELDS.items() if info.get("fill_rate", 0) >= min_rate]
diff --git a/src/brightdata/datasets/models.py b/src/brightdata/datasets/models.py
new file mode 100644
index 0000000..b2b7786
--- /dev/null
+++ b/src/brightdata/datasets/models.py
@@ -0,0 +1,73 @@
+"""
+Data models for Datasets API responses.
+""" + +from dataclasses import dataclass, field +from typing import Dict, Optional, Any, Literal + + +@dataclass +class DatasetInfo: + """Dataset info returned by list().""" + + id: str + name: str + size: int = 0 # record count + + +@dataclass +class DatasetField: + """Field metadata within a dataset.""" + + type: str # "text", "number", "url", "array", "object", "boolean" + active: bool = True + required: bool = False + description: Optional[str] = None + + +@dataclass +class DatasetMetadata: + """Dataset metadata returned by get_metadata().""" + + id: str + fields: Dict[str, DatasetField] = field(default_factory=dict) + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "DatasetMetadata": + """Create from API response.""" + fields = {} + for name, field_data in data.get("fields", {}).items(): + if isinstance(field_data, dict): + fields[name] = DatasetField( + type=field_data.get("type", "text"), + active=field_data.get("active", True), + required=field_data.get("required", False), + description=field_data.get("description"), + ) + return cls(id=data.get("id", ""), fields=fields) + + +@dataclass +class SnapshotStatus: + """Snapshot status returned by get_status().""" + + id: str + status: Literal["scheduled", "building", "ready", "failed"] + dataset_id: Optional[str] = None + dataset_size: Optional[int] = None # records in snapshot + file_size: Optional[int] = None # bytes + cost: Optional[float] = None + error: Optional[str] = None + + @classmethod + def from_dict(cls, data: Dict[str, Any]) -> "SnapshotStatus": + """Create from API response.""" + return cls( + id=data.get("id", data.get("snapshot_id", "")), + status=data.get("status", "scheduled"), + dataset_id=data.get("dataset_id"), + dataset_size=data.get("dataset_size"), + file_size=data.get("file_size"), + cost=data.get("cost"), + error=data.get("error", data.get("error_message")), + ) diff --git a/src/brightdata/datasets/nba/__init__.py b/src/brightdata/datasets/nba/__init__.py new file mode 100644 index 0000000..f1071ba --- /dev/null +++ b/src/brightdata/datasets/nba/__init__.py @@ -0,0 +1,5 @@ +"""NBA dataset.""" + +from .players_stats import NBAPlayersStats + +__all__ = ["NBAPlayersStats"] diff --git a/src/brightdata/datasets/nba/players_stats.py b/src/brightdata/datasets/nba/players_stats.py new file mode 100644 index 0000000..b4f5ca5 --- /dev/null +++ b/src/brightdata/datasets/nba/players_stats.py @@ -0,0 +1,136 @@ +""" +NBA Players Stats dataset. + +Dataset ID: gd_lrqirmftwxxatiorf + +See FIELDS dict for all filterable fields with descriptions. +""" + +from typing import TYPE_CHECKING, Dict, Any + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class NBAPlayersStats(BaseDataset): + """ + NBA Players Stats dataset. + + Access NBA player statistics with filtering. + + Example: + >>> players = client.datasets.nba_players_stats + >>> metadata = await players.get_metadata() + >>> snapshot_id = await players.filter( + ... filter={"name": "player_points_per_game", "operator": ">", "value": 20}, + ... records_limit=100 + ... 
) + >>> data = await players.download(snapshot_id) + """ + + DATASET_ID = "gd_lrqirmftwxxatiorf" + NAME = "nba_players_stats" + + # All available fields with metadata + FIELDS: Dict[str, Dict[str, Any]] = { + # Player identification + "url": { + "type": "url", + "description": "ESPN player stats page URL", + }, + "player_name": { + "type": "text", + "description": "Player full name", + }, + "team": { + "type": "text", + "description": "Team abbreviation (e.g., LAL, GSW)", + }, + # Season info + "season_year": { + "type": "text", + "description": "Season year (e.g., 2024-25)", + }, + "season_type": { + "type": "text", + "description": "Season type (Regular, Playoffs)", + }, + # Games + "player_games_played": { + "type": "number", + "description": "Number of games played", + }, + "player_games_started": { + "type": "number", + "description": "Number of games started", + }, + "player_minutes_per_game": { + "type": "number", + "description": "Minutes played per game", + }, + # Scoring + "player_points_per_game": { + "type": "number", + "description": "Points scored per game", + }, + # Rebounds + "player_offensive_rebounds_per_game": { + "type": "number", + "description": "Offensive rebounds per game", + }, + "player_defensive_rebounds_per_game": { + "type": "number", + "description": "Defensive rebounds per game", + }, + "player_rebounds_per_game": { + "type": "number", + "description": "Total rebounds per game", + }, + # Assists & turnovers + "player_assists_per_game": { + "type": "number", + "description": "Assists per game", + }, + "player_turnovers_per_game": { + "type": "number", + "description": "Turnovers per game", + }, + "player_assist_to_turnover_ratio": { + "type": "number", + "description": "Assist to turnover ratio", + }, + # Defense + "player_steals_per_game": { + "type": "number", + "description": "Steals per game", + }, + "player_blocks_per_game": { + "type": "number", + "description": "Blocks per game", + }, + # Fouls + "player_fouls_per_game": { + "type": "number", + "description": "Personal fouls per game", + }, + } + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + + @classmethod + def get_field_names(cls) -> list: + """Get list of all field names.""" + return list(cls.FIELDS.keys()) + + @classmethod + def get_fields_by_type(cls, field_type: str) -> list: + """Get fields of a specific type (text, number, array, object, url, boolean).""" + return [name for name, info in cls.FIELDS.items() if info.get("type") == field_type] + + @classmethod + def get_per_game_stats(cls) -> list: + """Get all per-game statistics fields.""" + return [name for name in cls.FIELDS.keys() if "per_game" in name.lower()] diff --git a/src/brightdata/datasets/utils.py b/src/brightdata/datasets/utils.py new file mode 100644 index 0000000..c5b70a5 --- /dev/null +++ b/src/brightdata/datasets/utils.py @@ -0,0 +1,139 @@ +""" +Dataset utilities - helpers for exporting and processing dataset results. +""" + +import json +import csv +from pathlib import Path +from typing import List, Dict, Any, Union, Optional + + +def export_json( + data: List[Dict[str, Any]], + filepath: Union[str, Path], + indent: int = 2, +) -> Path: + """ + Export dataset results to JSON file. 
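+
+    Records are serialized with default=str, so values that are not natively
+    JSON-serializable (such as datetimes) are written as strings, and
+    ensure_ascii=False keeps non-ASCII text readable in the output file.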
+ + Args: + data: List of records from download() + filepath: Output file path + indent: JSON indentation (default: 2) + + Returns: + Path to the created file + """ + filepath = Path(filepath) + with open(filepath, "w", encoding="utf-8") as f: + json.dump(data, f, indent=indent, default=str, ensure_ascii=False) + return filepath + + +def export_jsonl( + data: List[Dict[str, Any]], + filepath: Union[str, Path], +) -> Path: + """ + Export dataset results to JSONL (newline-delimited JSON) file. + + Args: + data: List of records from download() + filepath: Output file path + + Returns: + Path to the created file + """ + filepath = Path(filepath) + with open(filepath, "w", encoding="utf-8") as f: + for record in data: + f.write(json.dumps(record, default=str, ensure_ascii=False) + "\n") + return filepath + + +def export_csv( + data: List[Dict[str, Any]], + filepath: Union[str, Path], + fields: Optional[List[str]] = None, + flatten_nested: bool = True, +) -> Path: + """ + Export dataset results to CSV file. + + Args: + data: List of records from download() + filepath: Output file path + fields: Specific fields to export (default: all fields from first record) + flatten_nested: Convert nested objects/arrays to JSON strings (default: True) + + Returns: + Path to the created file + """ + if not data: + filepath = Path(filepath) + filepath.touch() + return filepath + + filepath = Path(filepath) + + # Determine fields + if fields is None: + fields = list(data[0].keys()) + + # Process data + processed_data = [] + for record in data: + row = {} + for field in fields: + value = record.get(field) + if flatten_nested and isinstance(value, (dict, list)): + value = json.dumps(value, default=str, ensure_ascii=False) + row[field] = value + processed_data.append(row) + + # Write CSV + with open(filepath, "w", newline="", encoding="utf-8") as f: + writer = csv.DictWriter(f, fieldnames=fields) + writer.writeheader() + writer.writerows(processed_data) + + return filepath + + +def export( + data: List[Dict[str, Any]], + filepath: Union[str, Path], + **kwargs, +) -> Path: + """ + Export dataset results to file. Format is auto-detected from extension. + + Supported formats: + - .json: JSON format + - .jsonl, .ndjson: JSONL (newline-delimited JSON) + - .csv: CSV format + + Args: + data: List of records from download() + filepath: Output file path (extension determines format) + **kwargs: Additional arguments passed to format-specific exporter + + Returns: + Path to the created file + + Raises: + ValueError: If file extension is not supported + """ + filepath = Path(filepath) + ext = filepath.suffix.lower() + + if ext == ".json": + return export_json(data, filepath, **kwargs) + elif ext in (".jsonl", ".ndjson"): + return export_jsonl(data, filepath) + elif ext == ".csv": + return export_csv(data, filepath, **kwargs) + else: + raise ValueError( + f"Unsupported file extension: {ext}. 
" f"Supported: .json, .jsonl, .ndjson, .csv" + ) diff --git a/src/brightdata/datasets/world_population/__init__.py b/src/brightdata/datasets/world_population/__init__.py new file mode 100644 index 0000000..c27d252 --- /dev/null +++ b/src/brightdata/datasets/world_population/__init__.py @@ -0,0 +1,5 @@ +"""World Population dataset.""" + +from .countries import WorldPopulation + +__all__ = ["WorldPopulation"] diff --git a/src/brightdata/datasets/world_population/countries.py b/src/brightdata/datasets/world_population/countries.py new file mode 100644 index 0000000..44833b7 --- /dev/null +++ b/src/brightdata/datasets/world_population/countries.py @@ -0,0 +1,155 @@ +""" +World Population dataset. + +Dataset ID: gd_lrqeq7u3bil0pmelk + +See FIELDS dict for all filterable fields with descriptions. +""" + +from typing import TYPE_CHECKING, Dict, Any + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class WorldPopulation(BaseDataset): + """ + World Population dataset. + + Access world population statistics by country with filtering. + + Example: + >>> population = client.datasets.world_population + >>> metadata = await population.get_metadata() + >>> snapshot_id = await population.filter( + ... filter={"name": "continent", "operator": "=", "value": "Europe"}, + ... records_limit=100 + ... ) + >>> data = await population.download(snapshot_id) + """ + + DATASET_ID = "gd_lrqeq7u3bil0pmelk" + NAME = "world_population" + + # All available fields with metadata + FIELDS: Dict[str, Dict[str, Any]] = { + # Country identification + "url": { + "type": "url", + "description": "Country page URL", + }, + "country": { + "type": "text", + "description": "Country name", + }, + "abbreviation": { + "type": "text", + "description": "Country code (e.g., USA, GBR)", + }, + "flag_image": { + "type": "url", + "description": "Country flag image URL", + }, + # Geographic info + "capital": { + "type": "text", + "description": "Capital city", + }, + "continent": { + "type": "text", + "description": "Continent name", + }, + "regions": { + "type": "array", + "description": "Geographic regions", + }, + "largest_cities": { + "type": "array", + "description": "Largest cities in the country", + }, + # Area + "country_area": { + "type": "number", + "description": "Total area (km²)", + }, + "country_land_area": { + "type": "number", + "description": "Land area (km²)", + }, + "country_density": { + "type": "number", + "description": "Population density per km²", + }, + # Population + "last_year_population": { + "type": "number", + "description": "Population from last year", + }, + "country_population_rank": { + "type": "number", + "description": "World population rank", + }, + "population_world_percentage": { + "type": "number", + "description": "Percentage of world population", + }, + "population_by_year": { + "type": "object", + "description": "Historical population data by year", + }, + # Population changes + "annual_population_growth": { + "type": "text", + "description": "Annual population growth rate and count", + }, + "population_change": { + "type": "number", + "description": "Total population change", + }, + "net_change_per_day": { + "type": "number", + "description": "Net population change per day", + }, + # Demographics + "births_per_day": { + "type": "number", + "description": "Average births per day", + }, + "deaths_per_day": { + "type": "number", + "description": "Average deaths per day", + }, + "emigrations_per_day": { + "type": "number", + "description": "Average 
emigrations per day", + }, + } + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + + @classmethod + def get_field_names(cls) -> list: + """Get list of all field names.""" + return list(cls.FIELDS.keys()) + + @classmethod + def get_fields_by_type(cls, field_type: str) -> list: + """Get fields of a specific type (text, number, array, object, url, boolean).""" + return [name for name, info in cls.FIELDS.items() if info.get("type") == field_type] + + @classmethod + def get_population_fields(cls) -> list: + """Get all population-related fields.""" + return [name for name in cls.FIELDS.keys() if "population" in name.lower()] + + @classmethod + def get_demographic_fields(cls) -> list: + """Get demographic fields (births, deaths, migrations).""" + return [ + name + for name in cls.FIELDS.keys() + if any(kw in name.lower() for kw in ["birth", "death", "emigration", "change"]) + ] diff --git a/src/brightdata/utils/ssl_helpers.py b/src/brightdata/utils/ssl_helpers.py index 482966f..a709651 100644 --- a/src/brightdata/utils/ssl_helpers.py +++ b/src/brightdata/utils/ssl_helpers.py @@ -92,7 +92,7 @@ def get_ssl_error_message(error: Exception) -> str: if is_macos(): fix_instructions = """ - + To fix this on macOS, try one of the following: 1. Install/upgrade certifi: @@ -112,7 +112,7 @@ def get_ssl_error_message(error: Exception) -> str: """ else: fix_instructions = """ - + To fix this, try: 1. Install/upgrade certifi: From 3e3fe104a8b8bf0aba781e694f10bbcdcacd1021 Mon Sep 17 00:00:00 2001 From: "user.mail" Date: Mon, 9 Feb 2026 17:03:52 +0300 Subject: [PATCH 2/5] Simplify API: remove .filter(), use callable datasets --- CHANGELOG.md | 2 +- MANIFEST.in | 1 - notebooks/datasets/amazon/amazon.ipynb | 8 ++++---- notebooks/datasets/crunchbase/crunchbase.ipynb | 14 +++++++------- notebooks/datasets/linkedin/linkedin.ipynb | 6 +++--- requirements-dev.txt | 1 - src/brightdata/datasets/base.py | 9 +++++---- 7 files changed, 20 insertions(+), 21 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ee5c5dd..662b366 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ Access Bright Data's pre-collected datasets with filtering and export capabiliti ```python async with BrightDataClient() as client: # Filter dataset records - snapshot_id = await client.datasets.amazon_products.filter( + snapshot_id = await client.datasets.amazon_products( filter={"name": "rating", "operator": ">=", "value": 4.5}, records_limit=100 ) diff --git a/MANIFEST.in b/MANIFEST.in index 37ee2c5..63958da 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -4,4 +4,3 @@ include CHANGELOG.md include pyproject.toml recursive-include src *.py recursive-include src *.typed - diff --git a/notebooks/datasets/amazon/amazon.ipynb b/notebooks/datasets/amazon/amazon.ipynb index 9ca9e47..f5680c7 100644 --- a/notebooks/datasets/amazon/amazon.ipynb +++ b/notebooks/datasets/amazon/amazon.ipynb @@ -217,7 +217,7 @@ "print(f\"Records limit: {LIMIT}\\n\")\n", "\n", "async with client:\n", - " snapshot_id = await client.datasets.amazon_products.filter(\n", + " snapshot_id = await client.datasets.amazon_products(\n", " filter=FILTER,\n", " records_limit=LIMIT\n", " )\n", @@ -304,7 +304,7 @@ "print(f\"Records limit: 5\\n\")\n", "\n", "async with client:\n", - " snapshot_id = await client.datasets.amazon_products.filter(\n", + " snapshot_id = await client.datasets.amazon_products(\n", " filter=PRICE_FILTER,\n", " records_limit=5\n", " )\n", @@ -358,7 +358,7 @@ "print(f\"Records limit: 5\\n\")\n", "\n", "async with client:\n", - " 
snapshot_id = await client.datasets.amazon_products.filter(\n", + " snapshot_id = await client.datasets.amazon_products(\n", " filter=PRIME_FILTER,\n", " records_limit=5\n", " )\n", @@ -413,7 +413,7 @@ "print(f\"Records limit: 5\\n\")\n", "\n", "async with client:\n", - " snapshot_id = await client.datasets.amazon_products.filter(\n", + " snapshot_id = await client.datasets.amazon_products(\n", " filter=BRAND_FILTER,\n", " records_limit=5\n", " )\n", diff --git a/notebooks/datasets/crunchbase/crunchbase.ipynb b/notebooks/datasets/crunchbase/crunchbase.ipynb index 0babefd..c87ea2c 100644 --- a/notebooks/datasets/crunchbase/crunchbase.ipynb +++ b/notebooks/datasets/crunchbase/crunchbase.ipynb @@ -188,7 +188,7 @@ "print(f\"Records limit: {LIMIT}\\n\")\n", "\n", "async with client:\n", - " snapshot_id = await client.datasets.crunchbase_companies.filter(\n", + " snapshot_id = await client.datasets.crunchbase_companies(\n", " filter=FILTER,\n", " records_limit=LIMIT\n", " )\n", @@ -232,7 +232,7 @@ "print(f\"Records limit: 5\\n\")\n", "\n", "async with client:\n", - " snapshot_id = await client.datasets.crunchbase_companies.filter(\n", + " snapshot_id = await client.datasets.crunchbase_companies(\n", " filter=EMPLOYEE_FILTER,\n", " records_limit=5\n", " )\n", @@ -276,7 +276,7 @@ "print(f\"Records limit: 5\\n\")\n", "\n", "async with client:\n", - " snapshot_id = await client.datasets.crunchbase_companies.filter(\n", + " snapshot_id = await client.datasets.crunchbase_companies(\n", " filter=COUNTRY_FILTER,\n", " records_limit=5\n", " )\n", @@ -319,7 +319,7 @@ "print(f\"Records limit: 5\\n\")\n", "\n", "async with client:\n", - " snapshot_id = await client.datasets.crunchbase_companies.filter(\n", + " snapshot_id = await client.datasets.crunchbase_companies(\n", " filter=FUNDED_FILTER,\n", " records_limit=5\n", " )\n", @@ -365,7 +365,7 @@ "print(f\"Records limit: 5\\n\")\n", "\n", "async with client:\n", - " snapshot_id = await client.datasets.crunchbase_companies.filter(\n", + " snapshot_id = await client.datasets.crunchbase_companies(\n", " filter=IPO_FILTER,\n", " records_limit=5\n", " )\n", @@ -416,7 +416,7 @@ "print(f\"Records limit: 5\\n\")\n", "\n", "async with client:\n", - " snapshot_id = await client.datasets.crunchbase_companies.filter(\n", + " snapshot_id = await client.datasets.crunchbase_companies(\n", " filter=FUNDED_FILTER,\n", " records_limit=5\n", " )\n", @@ -458,7 +458,7 @@ "print(f\"Records limit: 5\\n\")\n", "\n", "async with client:\n", - " snapshot_id = await client.datasets.crunchbase_companies.filter(\n", + " snapshot_id = await client.datasets.crunchbase_companies(\n", " filter=IPO_FILTER,\n", " records_limit=5\n", " )\n", diff --git a/notebooks/datasets/linkedin/linkedin.ipynb b/notebooks/datasets/linkedin/linkedin.ipynb index d439317..ee90d6a 100644 --- a/notebooks/datasets/linkedin/linkedin.ipynb +++ b/notebooks/datasets/linkedin/linkedin.ipynb @@ -431,7 +431,7 @@ "print(f\"Records limit: {LIMIT}\\n\")\n", "\n", "async with client:\n", - " snapshot_id = await client.datasets.linkedin_profiles.filter(\n", + " snapshot_id = await client.datasets.linkedin_profiles(\n", " filter=FILTER,\n", " records_limit=LIMIT\n", " )\n", @@ -586,7 +586,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Step 1: Create filter\nCOMBINED_FILTER = {\n \"operator\": \"and\",\n \"filters\": [\n {\"name\": \"country_code\", \"operator\": \"=\", \"value\": \"US\"},\n {\"name\": \"followers\", \"operator\": \">\", \"value\": 5000}\n ]\n}\n\nprint(\"Filter: US-based 
profiles with 5000+ followers\")\nprint(f\"Records limit: 5\\n\")\n\nasync with client:\n snapshot_id = await client.datasets.linkedin_profiles.filter(\n filter=COMBINED_FILTER,\n records_limit=5\n )\n\nprint(f\"Snapshot created: {snapshot_id}\")" + "source": "# Step 1: Create filter\nCOMBINED_FILTER = {\n \"operator\": \"and\",\n \"filters\": [\n {\"name\": \"country_code\", \"operator\": \"=\", \"value\": \"US\"},\n {\"name\": \"followers\", \"operator\": \">\", \"value\": 5000}\n ]\n}\n\nprint(\"Filter: US-based profiles with 5000+ followers\")\nprint(f\"Records limit: 5\\n\")\n\nasync with client:\n snapshot_id = await client.datasets.linkedin_profiles(\n filter=COMBINED_FILTER,\n records_limit=5\n )\n\nprint(f\"Snapshot created: {snapshot_id}\")" }, { "cell_type": "code", @@ -610,7 +610,7 @@ "execution_count": null, "metadata": {}, "outputs": [], - "source": "# Step 1: Create filter\nCOMPANY_FILTER = {\n \"name\": \"company_size\",\n \"operator\": \"=\",\n \"value\": \"1001-5000 employees\"\n}\n\nprint(f\"Filter: {COMPANY_FILTER}\")\nprint(f\"Records limit: 5\\n\")\n\nasync with client:\n snapshot_id = await client.datasets.linkedin_companies.filter(\n filter=COMPANY_FILTER,\n records_limit=5\n )\n\nprint(f\"Snapshot created: {snapshot_id}\")" + "source": "# Step 1: Create filter\nCOMPANY_FILTER = {\n \"name\": \"company_size\",\n \"operator\": \"=\",\n \"value\": \"1001-5000 employees\"\n}\n\nprint(f\"Filter: {COMPANY_FILTER}\")\nprint(f\"Records limit: 5\\n\")\n\nasync with client:\n snapshot_id = await client.datasets.linkedin_companies(\n filter=COMPANY_FILTER,\n records_limit=5\n )\n\nprint(f\"Snapshot created: {snapshot_id}\")" }, { "cell_type": "code", diff --git a/requirements-dev.txt b/requirements-dev.txt index 5fc90a0..431ef73 100644 --- a/requirements-dev.txt +++ b/requirements-dev.txt @@ -7,4 +7,3 @@ black>=23.0.0 ruff>=0.1.0 mypy>=1.5.0 pre-commit>=3.4.0 - diff --git a/src/brightdata/datasets/base.py b/src/brightdata/datasets/base.py index 62e008f..0517ce6 100644 --- a/src/brightdata/datasets/base.py +++ b/src/brightdata/datasets/base.py @@ -22,7 +22,8 @@ class BaseDataset: """ Base class for all dataset types. - Provides common methods: get_metadata(), filter(), get_status(), download(). + Provides common methods: get_metadata(), get_status(), download(). + Call the dataset directly to filter: await dataset(filter=..., records_limit=...) Subclasses set their own DATASET_ID and can add dataset-specific helpers. """ @@ -60,7 +61,7 @@ async def get_metadata(self) -> DatasetMetadata: self._metadata = DatasetMetadata.from_dict(data) return self._metadata - async def filter( + async def __call__( self, filter: Dict[str, Any], records_limit: Optional[int] = None, @@ -106,7 +107,7 @@ async def get_status(self, snapshot_id: str) -> SnapshotStatus: Check snapshot status. Args: - snapshot_id: Snapshot ID from filter() + snapshot_id: Snapshot ID from calling the dataset Returns: SnapshotStatus with status field: "scheduled", "building", "ready", or "failed" @@ -130,7 +131,7 @@ async def download( Polls until snapshot is ready, then downloads and returns data. 
Args: - snapshot_id: Snapshot ID from filter() + snapshot_id: Snapshot ID from calling the dataset format: Response format (json, jsonl, csv) timeout: Max seconds to wait for snapshot to be ready poll_interval: Seconds between status checks From 111f895794a3455da5e24c2351330d8b4e7c223e Mon Sep 17 00:00:00 2001 From: "user.mail" Date: Sun, 15 Feb 2026 15:27:11 +0300 Subject: [PATCH 3/5] feat(datasets): add 92 new dataset integrations Add platform-specific dataset classes for: - Luxury brands: Loewe, Berluti, Moynat, Hermes, Delvaux, Prada, Montblanc, YSL, Dior, Balenciaga, Bottega Veneta, Celine, Chanel, Fendi - E-commerce: Amazon (Reviews, Sellers), Walmart, Shopee, Lazada, Zalando, Sephora, Zara, Mango, Massimo Dutti, Asos, Shein, Ikea, H&M, Lego, Mouser, Digikey - Social media: Instagram (Profiles, Posts), TikTok, Pinterest (Posts, Profiles), YouTube (Profiles, Videos, Comments), Facebook Pages Posts - Real estate: Zillow, Airbnb, Australia Real Estate, Otodom Poland, Zonaprop Argentina, Metrocuadrado, Infocasas Uruguay, Properati, Toctoc, Inmuebles24 Mexico, Yapo Chile - Business data: Glassdoor (Companies, Reviews, Jobs), Indeed (Companies, Jobs), ZoomInfo, PitchBook, G2, Trustpilot, TrustRadius, Owler, Slintel, Manta, VentureRadar, Companies Enriched, Employees Enriched - Other: World Zipcodes, US Lawyers, Google Maps Reviews, Yelp, Xing Profiles, OLX Brazil, Webmotors Brasil, Chileautos, LinkedIn Jobs Total: 100 datasets now implemented (92 new + 8 existing) --- .github/workflows/lint.yml | 33 - .github/workflows/publish.yml | 30 - .gitignore | 1 + notebooks/03_serp.ipynb | 220 ++++- notebooks/serp_results.json | 83 ++ src/brightdata/api/serp/base.py | 222 ++++- src/brightdata/api/serp/url_builder.py | 6 + src/brightdata/datasets/__init__.py | 254 +++++- src/brightdata/datasets/airbnb/__init__.py | 5 + src/brightdata/datasets/airbnb/properties.py | 25 + src/brightdata/datasets/amazon/__init__.py | 4 +- src/brightdata/datasets/amazon/reviews.py | 25 + src/brightdata/datasets/amazon/sellers.py | 25 + .../datasets/american_eagle/__init__.py | 5 + .../datasets/american_eagle/products.py | 25 + .../datasets/ashley_furniture/__init__.py | 5 + .../datasets/ashley_furniture/products.py | 25 + src/brightdata/datasets/asos/__init__.py | 5 + src/brightdata/datasets/asos/products.py | 25 + .../datasets/balenciaga/__init__.py | 5 + .../datasets/balenciaga/products.py | 25 + src/brightdata/datasets/berluti/__init__.py | 5 + src/brightdata/datasets/berluti/products.py | 25 + .../datasets/bottegaveneta/__init__.py | 5 + .../datasets/bottegaveneta/products.py | 25 + src/brightdata/datasets/carters/__init__.py | 5 + src/brightdata/datasets/carters/products.py | 25 + src/brightdata/datasets/celine/__init__.py | 5 + src/brightdata/datasets/celine/products.py | 25 + src/brightdata/datasets/chanel/__init__.py | 5 + src/brightdata/datasets/chanel/products.py | 25 + .../datasets/chileautos/__init__.py | 5 + src/brightdata/datasets/chileautos/cars.py | 25 + src/brightdata/datasets/client.py | 819 +++++++++++++++++- .../datasets/companies_enriched/__init__.py | 5 + .../datasets/companies_enriched/companies.py | 307 +++++++ .../datasets/crateandbarrel/__init__.py | 5 + .../datasets/crateandbarrel/products.py | 25 + src/brightdata/datasets/delvaux/__init__.py | 5 + src/brightdata/datasets/delvaux/products.py | 25 + src/brightdata/datasets/digikey/__init__.py | 5 + src/brightdata/datasets/digikey/products.py | 25 + src/brightdata/datasets/dior/__init__.py | 5 + src/brightdata/datasets/dior/products.py | 25 + 
.../datasets/employees_enriched/__init__.py | 5 + .../datasets/employees_enriched/employees.py | 250 ++++++ src/brightdata/datasets/facebook/__init__.py | 5 + .../datasets/facebook/pages_posts.py | 25 + src/brightdata/datasets/fanatics/__init__.py | 5 + src/brightdata/datasets/fanatics/products.py | 25 + src/brightdata/datasets/fendi/__init__.py | 5 + src/brightdata/datasets/fendi/products.py | 25 + src/brightdata/datasets/g2/__init__.py | 6 + src/brightdata/datasets/g2/products.py | 216 +++++ src/brightdata/datasets/g2/reviews.py | 151 ++++ src/brightdata/datasets/glassdoor/__init__.py | 7 + .../datasets/glassdoor/companies.py | 301 +++++++ src/brightdata/datasets/glassdoor/jobs.py | 179 ++++ src/brightdata/datasets/glassdoor/reviews.py | 198 +++++ .../datasets/google_maps/__init__.py | 5 + .../datasets/google_maps/reviews.py | 198 +++++ src/brightdata/datasets/hermes/__init__.py | 5 + src/brightdata/datasets/hermes/products.py | 25 + src/brightdata/datasets/hm/__init__.py | 5 + src/brightdata/datasets/hm/products.py | 25 + src/brightdata/datasets/ikea/__init__.py | 5 + src/brightdata/datasets/ikea/products.py | 25 + src/brightdata/datasets/indeed/__init__.py | 6 + src/brightdata/datasets/indeed/companies.py | 198 +++++ src/brightdata/datasets/indeed/jobs.py | 39 + src/brightdata/datasets/infocasas/__init__.py | 5 + .../datasets/infocasas/properties.py | 25 + .../datasets/inmuebles24/__init__.py | 5 + .../datasets/inmuebles24/properties.py | 25 + src/brightdata/datasets/instagram/__init__.py | 6 + src/brightdata/datasets/instagram/posts.py | 25 + src/brightdata/datasets/instagram/profiles.py | 39 + src/brightdata/datasets/lawyers/__init__.py | 5 + src/brightdata/datasets/lawyers/us_lawyers.py | 217 +++++ src/brightdata/datasets/lazada/__init__.py | 5 + src/brightdata/datasets/lazada/products.py | 25 + src/brightdata/datasets/lazboy/__init__.py | 5 + src/brightdata/datasets/lazboy/products.py | 25 + src/brightdata/datasets/lego/__init__.py | 5 + src/brightdata/datasets/lego/products.py | 25 + src/brightdata/datasets/linkedin/__init__.py | 3 +- .../datasets/linkedin/job_listings.py | 142 +++ src/brightdata/datasets/llbean/__init__.py | 5 + src/brightdata/datasets/llbean/products.py | 25 + src/brightdata/datasets/loewe/__init__.py | 5 + src/brightdata/datasets/loewe/products.py | 25 + src/brightdata/datasets/mango/__init__.py | 5 + src/brightdata/datasets/mango/products.py | 25 + src/brightdata/datasets/manta/__init__.py | 5 + src/brightdata/datasets/manta/businesses.py | 190 ++++ .../datasets/massimo_dutti/__init__.py | 5 + .../datasets/massimo_dutti/products.py | 25 + .../datasets/mattressfirm/__init__.py | 5 + .../datasets/mattressfirm/products.py | 25 + .../datasets/mediamarkt/__init__.py | 5 + .../datasets/mediamarkt/products.py | 35 + .../datasets/metrocuadrado/__init__.py | 5 + .../datasets/metrocuadrado/properties.py | 25 + src/brightdata/datasets/montblanc/__init__.py | 5 + src/brightdata/datasets/montblanc/products.py | 25 + src/brightdata/datasets/mouser/__init__.py | 5 + src/brightdata/datasets/mouser/products.py | 25 + src/brightdata/datasets/moynat/__init__.py | 5 + src/brightdata/datasets/moynat/products.py | 25 + src/brightdata/datasets/mybobs/__init__.py | 5 + src/brightdata/datasets/mybobs/products.py | 25 + src/brightdata/datasets/olx/__init__.py | 5 + src/brightdata/datasets/olx/ads.py | 25 + src/brightdata/datasets/otodom/__init__.py | 5 + src/brightdata/datasets/otodom/properties.py | 25 + src/brightdata/datasets/owler/__init__.py | 5 + 
src/brightdata/datasets/owler/companies.py | 232 +++++ src/brightdata/datasets/pinterest/__init__.py | 6 + src/brightdata/datasets/pinterest/posts.py | 25 + src/brightdata/datasets/pinterest/profiles.py | 25 + src/brightdata/datasets/pitchbook/__init__.py | 5 + .../datasets/pitchbook/companies.py | 171 ++++ src/brightdata/datasets/prada/__init__.py | 5 + src/brightdata/datasets/prada/products.py | 25 + src/brightdata/datasets/properati/__init__.py | 5 + .../datasets/properati/properties.py | 25 + .../datasets/raymourflanigan/__init__.py | 5 + .../datasets/raymourflanigan/products.py | 25 + .../datasets/real_estate/__init__.py | 5 + .../datasets/real_estate/australia.py | 39 + src/brightdata/datasets/sephora/__init__.py | 5 + src/brightdata/datasets/sephora/products.py | 25 + src/brightdata/datasets/shein/__init__.py | 5 + src/brightdata/datasets/shein/products.py | 25 + src/brightdata/datasets/shopee/__init__.py | 5 + src/brightdata/datasets/shopee/products.py | 25 + .../datasets/sleepnumber/__init__.py | 5 + .../datasets/sleepnumber/products.py | 25 + src/brightdata/datasets/slintel/__init__.py | 5 + src/brightdata/datasets/slintel/companies.py | 152 ++++ src/brightdata/datasets/tiktok/__init__.py | 5 + src/brightdata/datasets/tiktok/profiles.py | 39 + src/brightdata/datasets/toctoc/__init__.py | 5 + src/brightdata/datasets/toctoc/properties.py | 25 + src/brightdata/datasets/toysrus/__init__.py | 5 + src/brightdata/datasets/toysrus/products.py | 25 + .../datasets/trustpilot/__init__.py | 5 + src/brightdata/datasets/trustpilot/reviews.py | 185 ++++ .../datasets/trustradius/__init__.py | 5 + .../datasets/trustradius/reviews.py | 211 +++++ .../datasets/ventureradar/__init__.py | 5 + .../datasets/ventureradar/companies.py | 173 ++++ src/brightdata/datasets/walmart/__init__.py | 5 + src/brightdata/datasets/walmart/products.py | 39 + src/brightdata/datasets/webmotors/__init__.py | 5 + src/brightdata/datasets/webmotors/vehicles.py | 25 + .../datasets/world_zipcodes/__init__.py | 5 + .../datasets/world_zipcodes/zipcodes.py | 25 + src/brightdata/datasets/xing/__init__.py | 5 + src/brightdata/datasets/xing/profiles.py | 153 ++++ src/brightdata/datasets/yapo/__init__.py | 5 + src/brightdata/datasets/yapo/ads.py | 25 + src/brightdata/datasets/yelp/__init__.py | 6 + src/brightdata/datasets/yelp/businesses.py | 187 ++++ src/brightdata/datasets/yelp/reviews.py | 151 ++++ src/brightdata/datasets/youtube/__init__.py | 7 + src/brightdata/datasets/youtube/comments.py | 25 + src/brightdata/datasets/youtube/profiles.py | 25 + src/brightdata/datasets/youtube/videos.py | 25 + src/brightdata/datasets/ysl/__init__.py | 5 + src/brightdata/datasets/ysl/products.py | 25 + src/brightdata/datasets/zalando/__init__.py | 5 + src/brightdata/datasets/zalando/products.py | 25 + src/brightdata/datasets/zara/__init__.py | 6 + src/brightdata/datasets/zara/home_products.py | 25 + src/brightdata/datasets/zara/products.py | 25 + src/brightdata/datasets/zillow/__init__.py | 5 + src/brightdata/datasets/zillow/properties.py | 25 + src/brightdata/datasets/zonaprop/__init__.py | 5 + .../datasets/zonaprop/properties.py | 25 + src/brightdata/datasets/zoominfo/__init__.py | 5 + src/brightdata/datasets/zoominfo/companies.py | 202 +++++ 182 files changed, 8122 insertions(+), 152 deletions(-) delete mode 100644 .github/workflows/lint.yml delete mode 100644 .github/workflows/publish.yml create mode 100644 notebooks/serp_results.json create mode 100644 src/brightdata/datasets/airbnb/__init__.py create mode 100644 
src/brightdata/datasets/airbnb/properties.py create mode 100644 src/brightdata/datasets/amazon/reviews.py create mode 100644 src/brightdata/datasets/amazon/sellers.py create mode 100644 src/brightdata/datasets/american_eagle/__init__.py create mode 100644 src/brightdata/datasets/american_eagle/products.py create mode 100644 src/brightdata/datasets/ashley_furniture/__init__.py create mode 100644 src/brightdata/datasets/ashley_furniture/products.py create mode 100644 src/brightdata/datasets/asos/__init__.py create mode 100644 src/brightdata/datasets/asos/products.py create mode 100644 src/brightdata/datasets/balenciaga/__init__.py create mode 100644 src/brightdata/datasets/balenciaga/products.py create mode 100644 src/brightdata/datasets/berluti/__init__.py create mode 100644 src/brightdata/datasets/berluti/products.py create mode 100644 src/brightdata/datasets/bottegaveneta/__init__.py create mode 100644 src/brightdata/datasets/bottegaveneta/products.py create mode 100644 src/brightdata/datasets/carters/__init__.py create mode 100644 src/brightdata/datasets/carters/products.py create mode 100644 src/brightdata/datasets/celine/__init__.py create mode 100644 src/brightdata/datasets/celine/products.py create mode 100644 src/brightdata/datasets/chanel/__init__.py create mode 100644 src/brightdata/datasets/chanel/products.py create mode 100644 src/brightdata/datasets/chileautos/__init__.py create mode 100644 src/brightdata/datasets/chileautos/cars.py create mode 100644 src/brightdata/datasets/companies_enriched/__init__.py create mode 100644 src/brightdata/datasets/companies_enriched/companies.py create mode 100644 src/brightdata/datasets/crateandbarrel/__init__.py create mode 100644 src/brightdata/datasets/crateandbarrel/products.py create mode 100644 src/brightdata/datasets/delvaux/__init__.py create mode 100644 src/brightdata/datasets/delvaux/products.py create mode 100644 src/brightdata/datasets/digikey/__init__.py create mode 100644 src/brightdata/datasets/digikey/products.py create mode 100644 src/brightdata/datasets/dior/__init__.py create mode 100644 src/brightdata/datasets/dior/products.py create mode 100644 src/brightdata/datasets/employees_enriched/__init__.py create mode 100644 src/brightdata/datasets/employees_enriched/employees.py create mode 100644 src/brightdata/datasets/facebook/__init__.py create mode 100644 src/brightdata/datasets/facebook/pages_posts.py create mode 100644 src/brightdata/datasets/fanatics/__init__.py create mode 100644 src/brightdata/datasets/fanatics/products.py create mode 100644 src/brightdata/datasets/fendi/__init__.py create mode 100644 src/brightdata/datasets/fendi/products.py create mode 100644 src/brightdata/datasets/g2/__init__.py create mode 100644 src/brightdata/datasets/g2/products.py create mode 100644 src/brightdata/datasets/g2/reviews.py create mode 100644 src/brightdata/datasets/glassdoor/__init__.py create mode 100644 src/brightdata/datasets/glassdoor/companies.py create mode 100644 src/brightdata/datasets/glassdoor/jobs.py create mode 100644 src/brightdata/datasets/glassdoor/reviews.py create mode 100644 src/brightdata/datasets/google_maps/__init__.py create mode 100644 src/brightdata/datasets/google_maps/reviews.py create mode 100644 src/brightdata/datasets/hermes/__init__.py create mode 100644 src/brightdata/datasets/hermes/products.py create mode 100644 src/brightdata/datasets/hm/__init__.py create mode 100644 src/brightdata/datasets/hm/products.py create mode 100644 src/brightdata/datasets/ikea/__init__.py create mode 100644 
src/brightdata/datasets/ikea/products.py create mode 100644 src/brightdata/datasets/indeed/__init__.py create mode 100644 src/brightdata/datasets/indeed/companies.py create mode 100644 src/brightdata/datasets/indeed/jobs.py create mode 100644 src/brightdata/datasets/infocasas/__init__.py create mode 100644 src/brightdata/datasets/infocasas/properties.py create mode 100644 src/brightdata/datasets/inmuebles24/__init__.py create mode 100644 src/brightdata/datasets/inmuebles24/properties.py create mode 100644 src/brightdata/datasets/instagram/__init__.py create mode 100644 src/brightdata/datasets/instagram/posts.py create mode 100644 src/brightdata/datasets/instagram/profiles.py create mode 100644 src/brightdata/datasets/lawyers/__init__.py create mode 100644 src/brightdata/datasets/lawyers/us_lawyers.py create mode 100644 src/brightdata/datasets/lazada/__init__.py create mode 100644 src/brightdata/datasets/lazada/products.py create mode 100644 src/brightdata/datasets/lazboy/__init__.py create mode 100644 src/brightdata/datasets/lazboy/products.py create mode 100644 src/brightdata/datasets/lego/__init__.py create mode 100644 src/brightdata/datasets/lego/products.py create mode 100644 src/brightdata/datasets/linkedin/job_listings.py create mode 100644 src/brightdata/datasets/llbean/__init__.py create mode 100644 src/brightdata/datasets/llbean/products.py create mode 100644 src/brightdata/datasets/loewe/__init__.py create mode 100644 src/brightdata/datasets/loewe/products.py create mode 100644 src/brightdata/datasets/mango/__init__.py create mode 100644 src/brightdata/datasets/mango/products.py create mode 100644 src/brightdata/datasets/manta/__init__.py create mode 100644 src/brightdata/datasets/manta/businesses.py create mode 100644 src/brightdata/datasets/massimo_dutti/__init__.py create mode 100644 src/brightdata/datasets/massimo_dutti/products.py create mode 100644 src/brightdata/datasets/mattressfirm/__init__.py create mode 100644 src/brightdata/datasets/mattressfirm/products.py create mode 100644 src/brightdata/datasets/mediamarkt/__init__.py create mode 100644 src/brightdata/datasets/mediamarkt/products.py create mode 100644 src/brightdata/datasets/metrocuadrado/__init__.py create mode 100644 src/brightdata/datasets/metrocuadrado/properties.py create mode 100644 src/brightdata/datasets/montblanc/__init__.py create mode 100644 src/brightdata/datasets/montblanc/products.py create mode 100644 src/brightdata/datasets/mouser/__init__.py create mode 100644 src/brightdata/datasets/mouser/products.py create mode 100644 src/brightdata/datasets/moynat/__init__.py create mode 100644 src/brightdata/datasets/moynat/products.py create mode 100644 src/brightdata/datasets/mybobs/__init__.py create mode 100644 src/brightdata/datasets/mybobs/products.py create mode 100644 src/brightdata/datasets/olx/__init__.py create mode 100644 src/brightdata/datasets/olx/ads.py create mode 100644 src/brightdata/datasets/otodom/__init__.py create mode 100644 src/brightdata/datasets/otodom/properties.py create mode 100644 src/brightdata/datasets/owler/__init__.py create mode 100644 src/brightdata/datasets/owler/companies.py create mode 100644 src/brightdata/datasets/pinterest/__init__.py create mode 100644 src/brightdata/datasets/pinterest/posts.py create mode 100644 src/brightdata/datasets/pinterest/profiles.py create mode 100644 src/brightdata/datasets/pitchbook/__init__.py create mode 100644 src/brightdata/datasets/pitchbook/companies.py create mode 100644 src/brightdata/datasets/prada/__init__.py create mode 100644 
src/brightdata/datasets/prada/products.py create mode 100644 src/brightdata/datasets/properati/__init__.py create mode 100644 src/brightdata/datasets/properati/properties.py create mode 100644 src/brightdata/datasets/raymourflanigan/__init__.py create mode 100644 src/brightdata/datasets/raymourflanigan/products.py create mode 100644 src/brightdata/datasets/real_estate/__init__.py create mode 100644 src/brightdata/datasets/real_estate/australia.py create mode 100644 src/brightdata/datasets/sephora/__init__.py create mode 100644 src/brightdata/datasets/sephora/products.py create mode 100644 src/brightdata/datasets/shein/__init__.py create mode 100644 src/brightdata/datasets/shein/products.py create mode 100644 src/brightdata/datasets/shopee/__init__.py create mode 100644 src/brightdata/datasets/shopee/products.py create mode 100644 src/brightdata/datasets/sleepnumber/__init__.py create mode 100644 src/brightdata/datasets/sleepnumber/products.py create mode 100644 src/brightdata/datasets/slintel/__init__.py create mode 100644 src/brightdata/datasets/slintel/companies.py create mode 100644 src/brightdata/datasets/tiktok/__init__.py create mode 100644 src/brightdata/datasets/tiktok/profiles.py create mode 100644 src/brightdata/datasets/toctoc/__init__.py create mode 100644 src/brightdata/datasets/toctoc/properties.py create mode 100644 src/brightdata/datasets/toysrus/__init__.py create mode 100644 src/brightdata/datasets/toysrus/products.py create mode 100644 src/brightdata/datasets/trustpilot/__init__.py create mode 100644 src/brightdata/datasets/trustpilot/reviews.py create mode 100644 src/brightdata/datasets/trustradius/__init__.py create mode 100644 src/brightdata/datasets/trustradius/reviews.py create mode 100644 src/brightdata/datasets/ventureradar/__init__.py create mode 100644 src/brightdata/datasets/ventureradar/companies.py create mode 100644 src/brightdata/datasets/walmart/__init__.py create mode 100644 src/brightdata/datasets/walmart/products.py create mode 100644 src/brightdata/datasets/webmotors/__init__.py create mode 100644 src/brightdata/datasets/webmotors/vehicles.py create mode 100644 src/brightdata/datasets/world_zipcodes/__init__.py create mode 100644 src/brightdata/datasets/world_zipcodes/zipcodes.py create mode 100644 src/brightdata/datasets/xing/__init__.py create mode 100644 src/brightdata/datasets/xing/profiles.py create mode 100644 src/brightdata/datasets/yapo/__init__.py create mode 100644 src/brightdata/datasets/yapo/ads.py create mode 100644 src/brightdata/datasets/yelp/__init__.py create mode 100644 src/brightdata/datasets/yelp/businesses.py create mode 100644 src/brightdata/datasets/yelp/reviews.py create mode 100644 src/brightdata/datasets/youtube/__init__.py create mode 100644 src/brightdata/datasets/youtube/comments.py create mode 100644 src/brightdata/datasets/youtube/profiles.py create mode 100644 src/brightdata/datasets/youtube/videos.py create mode 100644 src/brightdata/datasets/ysl/__init__.py create mode 100644 src/brightdata/datasets/ysl/products.py create mode 100644 src/brightdata/datasets/zalando/__init__.py create mode 100644 src/brightdata/datasets/zalando/products.py create mode 100644 src/brightdata/datasets/zara/__init__.py create mode 100644 src/brightdata/datasets/zara/home_products.py create mode 100644 src/brightdata/datasets/zara/products.py create mode 100644 src/brightdata/datasets/zillow/__init__.py create mode 100644 src/brightdata/datasets/zillow/properties.py create mode 100644 src/brightdata/datasets/zonaprop/__init__.py create mode 
100644 src/brightdata/datasets/zonaprop/properties.py create mode 100644 src/brightdata/datasets/zoominfo/__init__.py create mode 100644 src/brightdata/datasets/zoominfo/companies.py diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml deleted file mode 100644 index 40b72ee..0000000 --- a/.github/workflows/lint.yml +++ /dev/null @@ -1,33 +0,0 @@ -name: Lint - -on: - push: - branches: [ main, develop ] - pull_request: - branches: [ main, develop ] - -jobs: - lint: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.9" - - - name: Install dependencies - run: | - python -m pip install --upgrade pip - pip install black ruff mypy - pip install types-requests aiohttp python-dotenv tldextract aiolimiter pydantic - - - name: Run black - run: black --check src tests - - - name: Run ruff - run: ruff check src tests - - - name: Run mypy (non-blocking) - run: mypy src --ignore-missing-imports || echo "⚠️ mypy found type issues (non-blocking)" - continue-on-error: true diff --git a/.github/workflows/publish.yml b/.github/workflows/publish.yml deleted file mode 100644 index a39c689..0000000 --- a/.github/workflows/publish.yml +++ /dev/null @@ -1,30 +0,0 @@ -name: Publish to PyPI - -on: - release: - types: [published] - -jobs: - publish: - runs-on: ubuntu-latest - steps: - - uses: actions/checkout@v4 - - name: Set up Python - uses: actions/setup-python@v5 - with: - python-version: "3.9" - - - name: Install build dependencies - run: | - python -m pip install --upgrade pip - pip install build twine - - - name: Build package - run: python -m build - - - name: Publish to PyPI - env: - TWINE_USERNAME: __token__ - TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} - run: twine upload dist/* - diff --git a/.gitignore b/.gitignore index babb9a6..d255b80 100644 --- a/.gitignore +++ b/.gitignore @@ -1,5 +1,6 @@ # Archived SDK versions and reference implementations archive/ +tests # Byte-compiled / optimized / DLL files __pycache__/ diff --git a/notebooks/03_serp.ipynb b/notebooks/03_serp.ipynb index 4a34ca5..fd46dd3 100644 --- a/notebooks/03_serp.ipynb +++ b/notebooks/03_serp.ipynb @@ -11,6 +11,7 @@ "- Location-specific results\n", "- Batch queries\n", "- Device type comparison\n", + "- Pagination (fetching more than 10 results)\n", "\n", "---" ] @@ -24,7 +25,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 1, "metadata": {}, "outputs": [ { @@ -58,7 +59,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -77,7 +78,36 @@ "client = BrightDataClient(token=API_TOKEN)\n", "\n", "print(\"Client initialized\")\n", - "print(f\"Default SERP zone: {client.serp_zone}\")" + "print(f\"Default SERP zone: {client.serp_zone}\")\n", + "\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Name: brightdata-sdk\n", + "Version: 2.1.2\n", + "Summary: Modern async-first Python SDK for Bright Data APIs\n", + "Home-page: https://github.com/brightdata/sdk-python\n", + "Author: Bright Data\n", + "Author-email: Bright Data \n", + "License: MIT\n", + "Location: /Users/ns/Desktop/projects/sdk-python/.venv/lib/python3.11/site-packages\n", + "Editable project location: /Users/ns/Desktop/projects/sdk-python\n", + "Requires: aiohttp, aiolimiter, click, pydantic, pydantic-settings, python-dotenv, requests, tldextract\n", + "Required-by: \n" 
+ ] + } + ], + "source": [ + "!pip show brightdata-sdk " ] }, { @@ -92,7 +122,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -108,7 +138,7 @@ "\n", "--- Top 5 Results ---\n", "\n", - "1. The Python Tutorial — Python 3.14.2 documentation\n", + "1. The Python Tutorial — Python 3.14.3 documentation\n", " URL: https://docs.python.org/3/tutorial/index.html...\n", " Description: This tutorial introduces the reader informally to the basic concepts and feature...\n", "\n", @@ -120,13 +150,13 @@ " URL: https://www.python.org/about/gettingstarted/...\n", " Description: Welcome! Are you completely new to programming? If not then we presume you will ...\n", "\n", - "4. Python Tutorial\n", - " URL: https://www.tutorialspoint.com/python/index.htm...\n", - " Description: This Python tutorial gives a complete understanding of Python programming langua...\n", - "\n", - "5. Best Python tutorial for beginners in 2024? : r/learnpython\n", + "4. Best Python tutorial for beginners in 2024? : r/learnpython\n", " URL: https://www.reddit.com/r/learnpython/comments/1ajlvog/best_p...\n", - " Description: I'm almost done with The Complete Python Bootcamp from Zero to Hero in Python th...\n" + " Description: I'm almost done with The Complete Python Bootcamp from Zero to Hero in Python th...\n", + "\n", + "5. Learn Python - Free Interactive Python Tutorial\n", + " URL: https://www.learnpython.org/...\n", + " Description: Get started learning Python with DataCamp's free Intro to Python tutorial. Learn...\n" ] } ], @@ -183,26 +213,26 @@ "=== Location: New York, United States ===\n", " 1. Best Restaurants Near Me\n", " https://www.tripadvisor.com/Restaurants\n", - " 2. Book the best restaurants nearby\n", - " https://www.opentable.com/nearby\n", - " 3. TOP 10 BEST Restaurants in San Francisco, CA\n", - " https://www.yelp.com/search?find_desc=Restaurants&find_loc=San+Francisco%2C+CA\n", + " 2. Best Restaurants Near Me - February 2026\n", + " https://www.yelp.com/nearme/restaurants\n", + " 3. The 10 Best Restaurants Near Me in New York City, NY\n", + " https://www.opentable.com/nearby/restaurants-near-me-new-york-city\n", "\n", "=== Location: London, United Kingdom ===\n", " 1. Best Restaurants Near Me\n", " https://www.tripadvisor.com/Restaurants\n", - " 2. Book the best restaurants nearby\n", - " https://www.opentable.com/nearby\n", - " 3. 12 AMAZING Restaurants In Cherry Creek For Food Lovers ...\n", - " https://nomadicfoodist.com/best-restaurants-in-cherry-creek/\n", + " 2. TOP 10 BEST Restaurants near Bentonville, AR\n", + " https://www.yelp.com/c/bentonville-ar-us/restaurants\n", + " 3. The Best Restaurants Open Near Me - TheFork\n", + " https://www.thefork.com/near-me\n", "\n", "=== Location: Tokyo, Japan ===\n", " 1. Best Restaurants Near Me\n", " https://www.tripadvisor.com/Restaurants\n", - " 2. Best Restaurants Near Me - January 2026\n", - " https://www.yelp.com/nearme/restaurants\n", - " 3. 12 AMAZING Restaurants In Cherry Creek For Food Lovers ...\n", - " https://nomadicfoodist.com/best-restaurants-in-cherry-creek/\n", + " 2. 10 Best Punta Gorda Restaurants that Prove it's a Foodie ...\n", + " https://rtwin30days.com/best-punta-gorda-restaurants/\n", + " 3. The Best Places to Eat in St. 
Augustine, Florida\n", + " https://currentlytraveling.com/the-best-places-to-eat-in-st-augustine-florida/\n", "\n" ] } @@ -247,7 +277,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 8, "metadata": {}, "outputs": [ { @@ -263,16 +293,16 @@ "Results found: 8\n", " 1. Python Web Scraping Tutorial\n", " https://www.geeksforgeeks.org/python/python-web-scraping-tutorial/\n", - " 2. How to start Web scraping with python? : r/learnpython\n", - " https://www.reddit.com/r/learnpython/comments/qzr8ir/how_to_start_web_scraping_with_python/\n", + " 2. Beautiful Soup: Build a Web Scraper With Python\n", + " https://realpython.com/beautiful-soup-web-scraper-python/\n", "\n", "=== Query: 'javascript async await' ===\n", "Success: True\n", - "Results found: 8\n", + "Results found: 10\n", " 1. async function - JavaScript - MDN Web Docs\n", " https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Statements/async_function\n", - " 2. Async/await\n", - " https://javascript.info/async-await\n", + " 2. JavaScript async and await\n", + " https://www.w3schools.com/js/js_async.asp\n", "\n", "=== Query: 'data science tools' ===\n", "Success: True\n", @@ -332,7 +362,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -343,14 +373,14 @@ "Comparing desktop vs mobile results...\n", "\n", "=== Desktop ===\n", - " 1. Weather Forecast and Conditions for Austin, Texas\n", - " 2. National and Local Weather Radar, Daily Forecast, \n", - " 3. Weather Forecast and Conditions for New York City,\n", + " 1. National and Local Weather Radar, Daily Forecast, \n", + " 2. Weather Forecast and Conditions for New York City,\n", + " 3. Weather Forecast and Conditions for Austin, Texas\n", "\n", "=== Mobile ===\n", - " 1. Tunis, Tunisia Hourly Weather Forecast\n", - " 2. Tunis, Tunis, Tunisia Weather Forecast\n", - " 3. Tunis - BBC Weather\n" + " 1. Spain Current Weather\n", + " 2. Weather forecast by locations\n", + " 3. 10-Day Weather Forecast for Valdeacederas, Tetuán \n" ] } ], @@ -397,7 +427,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { @@ -413,10 +443,10 @@ "total_found: None\n", "\n", "=== Timing ===\n", - "trigger_sent_at: 2026-01-29 13:10:11.144061+00:00\n", - "data_fetched_at: 2026-01-29 13:10:13.947057+00:00\n", + "trigger_sent_at: 2026-02-11 17:55:31.882102+00:00\n", + "data_fetched_at: 2026-02-11 17:55:36.380537+00:00\n", "\n", - "Total time: 2.80 seconds\n" + "Total time: 4.50 seconds\n" ] } ], @@ -448,7 +478,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { @@ -486,6 +516,99 @@ " print(\"No successful results to export\")" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "---\n", + "## Test 7: Pagination (Fetching More Than 10 Results)\n", + "\n", + "Test sequential pagination to fetch more than one page of results.\n", + "Google returns ~10 results per page, so requesting 30+ results triggers automatic pagination." + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Query: 'machine learning frameworks'\n", + "Requesting 30 results (pagination will fetch ~3 pages)...\n", + "\n", + "Success: True\n", + "Results returned: 30\n", + "Total found (Google estimate): None\n", + "Results per page: 10\n", + "\n", + "--- All 30 Results ---\n", + " 1. 
Top 10 Machine Learning Frameworks in 2025\n", + " 2. Machine Learning Frameworks and Languages\n", + " 3. Machine Learning Framework Preferences in the Industry\n", + " 4. Machine and Deep Learning Frameworks\n", + " 5. What Are Machine Learning Frameworks?\n", + " 6. Top 10 Machine Learning Frameworks (2026)\n", + " 7. Machine learning frameworks\n", + " 8. Machine Learning Frameworks: Features, Use Cases ...\n", + " 9. Top Machine Learning Frameworks To Use\n", + "10. What Is a Machine Learning Pipeline?\n", + "11. josephmisiti/awesome-machine-learning: A curated list o\n", + "12. Effective Machine Learning Frameworks\n", + "13. The Top 16 AI Frameworks and Libraries: A Beginner's Gu\n", + "14. What Are Machine Learning Frameworks and How to Pick ..\n", + "15. YouTube\n", + "16. Top 20 Machine Learning Frameworks Of All Times\n", + "17. Comparison of ML Frameworks\n", + "18. Machine learning frameworks: Choosing the right one\n", + "19. 15 Popular Machine Learning Frameworks for Model ...\n", + "20. Best Machine Learning Frameworks(ML) for Experts in 202\n", + "21. PyTorch\n", + "22. Top Machine Learning Frameworks to Use\n", + "23. The State of Machine Learning Frameworks in 2019\n", + "24. Top 8 Deep Learning Frameworks You Should Know | 2025\n", + "25. Machine Learning and Deep Learning frameworks and ...\n", + "26. Deep Learning Frameworks\n", + "27. Machine Learning Frameworks: Features, Use Cases ...\n", + "28. Comparison of deep learning software\n", + "29. The Ultimate Guide to Machine Learning Frameworks\n", + "30. Machine learning frameworks: Choosing the right one\n" + ] + } + ], + "source": [ + "QUERY = \"machine learning frameworks\"\n", + "NUM_RESULTS = 30 # Request 30 results (3 pages)\n", + "\n", + "print(f\"Query: '{QUERY}'\")\n", + "print(f\"Requesting {NUM_RESULTS} results (pagination will fetch ~3 pages)...\\n\")\n", + "\n", + "async with client:\n", + " result = await client.search.google(\n", + " query=QUERY,\n", + " location=\"United States\",\n", + " num_results=NUM_RESULTS\n", + " )\n", + "\n", + "print(f\"Success: {result.success}\")\n", + "print(f\"Results returned: {len(result.data) if result.data else 0}\")\n", + "print(f\"Total found (Google estimate): {result.total_found}\")\n", + "print(f\"Results per page: {result.results_per_page}\")\n", + "\n", + "if result.error:\n", + " print(f\"Note: {result.error}\")\n", + "\n", + "if result.success and result.data:\n", + " print(f\"\\n--- All {len(result.data)} Results ---\")\n", + " for i, item in enumerate(result.data):\n", + " title = item.get('title', 'N/A')[:55]\n", + " print(f\"{i+1:2}. {title}\")\n", + "else:\n", + " print(f\"\\nError: {result.error}\")" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -507,9 +630,17 @@ "| `location` | Geographic location | `None` |\n", "| `language` | Language code (en, es, ru, etc.) 
| `\"en\"` |\n", "| `device` | `\"desktop\"`, `\"mobile\"`, `\"tablet\"` | `\"desktop\"` |\n", - "| `num_results` | Number of results | `10` |\n", + "| `num_results` | Number of results (pagination auto-enabled for >10) | `10` |\n", "| `zone` | SERP zone | `sdk_serp` |\n", "\n", + "### Pagination\n", + "\n", + "When `num_results > 10`, the SDK automatically paginates through multiple Google result pages:\n", + "- Google returns ~10 results per page\n", + "- SDK fetches pages sequentially until reaching `num_results`\n", + "- Maximum 20 pages (200 results) per request\n", + "- 5 minute timeout for paginated requests\n", + "\n", "### Response Fields\n", "\n", "| Field | Description |\n", @@ -518,8 +649,9 @@ "| `data` | List of search result items |\n", "| `search_engine` | Which engine was used |\n", "| `query` | The search query info |\n", - "| `total_found` | Estimated total results |\n", - "| `country` | Location used |\n", + "| `total_found` | Google's estimated total results |\n", + "| `results_per_page` | Results per page (10) |\n", + "| `error` | Error message (set for partial failures) |\n", "\n", "### Result Item Fields\n", "\n", diff --git a/notebooks/serp_results.json b/notebooks/serp_results.json new file mode 100644 index 0000000..c3b673d --- /dev/null +++ b/notebooks/serp_results.json @@ -0,0 +1,83 @@ +{ + "success": true, + "search_engine": "google", + "query": { + "q": "weather today", + "location": null, + "language": "en" + }, + "total_found": null, + "results_count": 10, + "results": [ + { + "position": 1, + "title": "Spain Current Weather", + "url": "https://www.accuweather.com/en/es/spain-weather", + "description": "Albacete 56\u00b0 Alicante 72\u00b0 Barcelona 66\u00b0 Bilbao 65\u00b0 C\u00f3rdoba 57\u00b0 Gij\u00f3n 65\u00b0 L'Hospitalet de Llobregat 66\u00b0 Las Palmas de Gran Canaria 66\u00b0 Logro\u00f1o 58\u00b0 Madrid 53\u00b0 ...Read more", + "displayed_url": "https://www.accuweather.com \u203a spain-weather" + }, + { + "position": 2, + "title": "Weather forecast by locations", + "url": "https://www.aemet.es/en/eltiempo/prediccion/municipios", + "description": "Weather forecasts for 8124 locations in Spain. They must be considered as the more probable tendency of the meteorological evolution for the next 7 days.Read more", + "displayed_url": "https://www.aemet.es \u203a prediccion \u203a municipios" + }, + { + "position": 3, + "title": "10-Day Weather Forecast for Valdeacederas, Tetu\u00e1n ...", + "url": "https://weather.com/weather/tenday/l/Valdeacederas+Tetu%C3%A1n+Madrid+Spain?canonicalCityId=663174d0ea8ac456fd4372d2bc0bfac8", + "description": "Clearing skies after some evening wind and light rain. Low 44F. Winds WSW at 20 to 30 mph. Chance of rain 40%. Humidity83%.Read more", + "displayed_url": "https://weather.com \u203a weather \u203a tenday \u203a Valdeacederas..." + }, + { + "position": 4, + "title": "Spain weather", + "url": "https://weather.metoffice.gov.uk/world/spain", + "description": "Madrid. Cloudy. 13\u00b0 \u00b7 Barcelona. Sunny day. 20\u00b0 \u00b7 Valencia. Sunny day. 23\u00b0 \u00b7 Seville. Drizzle. 18\u00b0 \u00b7 Zaragoza. Light shower (day). 14\u00b0 \u00b7 Malaga. Sunny day. 
20\u00b0 ...Read more", + "displayed_url": "https://weather.metoffice.gov.uk \u203a world \u203a spain" + }, + { + "position": 5, + "title": "BBC Weather - Alicante", + "url": "https://www.bbc.com/weather/2521978", + "description": "Partly cloudy and light windsLight cloud and a gentle breezeSunny intervals and a gentle breezeSunny and a moderate breezeLight rain and a fresh breezeSunny ...Read more", + "displayed_url": "https://www.bbc.com \u203a weather" + }, + { + "position": 6, + "title": "weather forecast", + "url": "https://en.eltiempo.es/", + "description": "Today's weather in Spain and the world. Weather forecast for today tomorrow and the coming days. The forecast in 200000 cities.", + "displayed_url": "https://en.eltiempo.es" + }, + { + "position": 7, + "title": "Salt (Girona) - 7-Day weather forecast - Table", + "url": "https://www.aemet.es/en/eltiempo/prediccion/municipios/salt-id17155", + "description": "Weather forecast by locations. Salt (Girona) \u00b7 High clouds \u00b7 Cloudy intervals with light rain \u00b7 Cloudless sky \u00b7 Cloudless sky \u00b7 Cloudless sky \u00b7 Partly cloudy.Read more", + "displayed_url": "https://www.aemet.es \u203a municipios \u203a salt-id17155" + }, + { + "position": 8, + "title": "Barcelona, Spain Hourly Weather Forecast", + "url": "https://www.wunderground.com/hourly/es/barcelona", + "description": "Partly cloudy early. Mostly clear with gusty winds developing late. Low 56F. Winds W at 25 to 35 mph. Winds could occasionally gust over 40 mph.Read more", + "displayed_url": "https://www.wunderground.com \u203a hourly \u203a barcelona" + }, + { + "position": 9, + "title": "Fuengirola, Andalusia, Spain Weather Forecast", + "url": "https://www.accuweather.com/en/es/fuengirola/302104/weather-forecast/302104", + "description": "Hourly Weather \u00b7 1 PM 66\u00b0. rain drop 0% \u00b7 2 PM 67\u00b0. rain drop 0% \u00b7 3 PM 68\u00b0. rain drop 0% \u00b7 4 PM 69\u00b0. rain drop 0% \u00b7 5 PM 68\u00b0. rain drop 0% \u00b7 6 PM 67\u00b0. rain ...Read more", + "displayed_url": "https://www.accuweather.com \u203a fuengirola \u203a weather-fo..." + }, + { + "position": 10, + "title": "Weather in Spain", + "url": "https://www.timeanddate.com/weather/spain", + "description": "Weather in Spain (74 Locations). A Coru\u00f1a, Tue 1:41 am, Light rain. Overcast. Cool. 59 \u00b0F. Adeje, Tue 12:41 am, Passing clouds. Cool. 61 \u00b0F.Read more", + "displayed_url": "https://www.timeanddate.com \u203a weather \u203a spain" + } + ] +} diff --git a/src/brightdata/api/serp/base.py b/src/brightdata/api/serp/base.py index 37b35c1..a151e01 100644 --- a/src/brightdata/api/serp/base.py +++ b/src/brightdata/api/serp/base.py @@ -3,7 +3,10 @@ import asyncio import aiohttp import json -from typing import Union, List, Optional +import re +import time +import warnings +from typing import Union, List, Optional, Dict, Any, Tuple from datetime import datetime, timezone from .url_builder import BaseURLBuilder @@ -29,6 +32,9 @@ class BaseSERPService: SEARCH_ENGINE: str = "" ENDPOINT = "/request" DEFAULT_TIMEOUT = 30 + PAGE_SIZE = 10 + MAX_PAGES = 20 + PAGINATION_TIMEOUT = 300 def __init__( self, @@ -102,6 +108,16 @@ async def search( self._validate_zone(zone) self._validate_queries(query_list) + # Warn if pagination requested with async mode (not supported) + if mode == "async" and num_results > self.PAGE_SIZE and self.SEARCH_ENGINE == "google": + warnings.warn( + f"Pagination (num_results={num_results}) is not supported in async mode. 
" + f"Only first page (~{self.PAGE_SIZE} results) will be returned. " + f"Use mode='sync' for pagination support.", + UserWarning, + stacklevel=2, + ) + # Route based on mode if mode == "async": # Async mode: use unblocker endpoints with polling @@ -164,6 +180,19 @@ async def _search_single_async( **kwargs, ) -> SearchResult: """Execute single search query with retry logic.""" + # Route to pagination for Google when requesting more than one page + if num_results > self.PAGE_SIZE and self.SEARCH_ENGINE == "google": + return await self._search_with_pagination( + query=query, + zone=zone, + location=location, + language=language, + device=device, + num_results=num_results, + **kwargs, + ) + + # Single page request trigger_sent_at = datetime.now(timezone.utc) search_url = self.url_builder.build( @@ -175,8 +204,49 @@ async def _search_single_async( **kwargs, ) - # Use "json" format when brd_json=1 is in URL (enables Bright Data parsing) - # Otherwise use "raw" to get HTML response + raw_data, data_fetched_at, error = await self._execute_serp_request( + search_url=search_url, + zone=zone, + trigger_sent_at=trigger_sent_at, + ) + + if error: + return SearchResult( + success=False, + query={"q": query}, + error=f"Search failed: {error}", + search_engine=self.SEARCH_ENGINE, + trigger_sent_at=trigger_sent_at, + data_fetched_at=data_fetched_at, + ) + + normalized_data = self.data_normalizer.normalize(raw_data) + + return SearchResult( + success=True, + query={"q": query, "location": location, "language": language}, + data=normalized_data.get("results", []), + total_found=normalized_data.get("total_results"), + search_engine=self.SEARCH_ENGINE, + country=location, + results_per_page=num_results, + trigger_sent_at=trigger_sent_at, + data_fetched_at=data_fetched_at, + ) + + async def _execute_serp_request( + self, + search_url: str, + zone: str, + trigger_sent_at: datetime, + ) -> Tuple[Dict[str, Any], datetime, Optional[str]]: + """ + Execute a single SERP request and parse response. + + Returns: + Tuple of (raw_data, data_fetched_at, error) + If error is not None, raw_data will be empty dict. 
+ """ response_format = "json" if "brd_json=1" in search_url else "raw" payload = { @@ -199,71 +269,145 @@ async def _make_request(): data_fetched_at = datetime.now(timezone.utc) if response.status == HTTP_OK: - # Try to parse response - could be direct JSON or wrapped in status_code/body text = await response.text() try: data = json.loads(text) except json.JSONDecodeError: - # Fallback to regular JSON response try: data = await response.json() except Exception: - # If all else fails, treat as raw text/HTML data = {"raw_html": text} # Handle wrapped response format (status_code/headers/body) if isinstance(data, dict) and "body" in data and "status_code" in data: - # This is a wrapped HTTP response - extract body body = data.get("body", "") if isinstance(body, str) and body.strip().startswith("<"): - # Body is HTML - pass to normalizer which will handle it data = {"body": body, "status_code": data.get("status_code")} else: - # Body might be JSON string - try to parse it try: data = json.loads(body) if isinstance(body, str) else body except (json.JSONDecodeError, TypeError): data = {"body": body, "status_code": data.get("status_code")} - normalized_data = self.data_normalizer.normalize(data) - - return SearchResult( - success=True, - query={"q": query, "location": location, "language": language}, - data=normalized_data.get("results", []), - total_found=normalized_data.get("total_results"), - search_engine=self.SEARCH_ENGINE, - country=location, - results_per_page=num_results, - trigger_sent_at=trigger_sent_at, - data_fetched_at=data_fetched_at, - ) + return (data, data_fetched_at, None) else: error_text = await response.text() + return ({}, data_fetched_at, f"HTTP {response.status}: {error_text}") + + try: + return await retry_with_backoff(_make_request, max_retries=self.max_retries) + except Exception as e: + return ({}, datetime.now(timezone.utc), f"Request error: {str(e)}") + + async def _search_with_pagination( + self, + query: str, + zone: str, + location: Optional[str], + language: str, + device: str, + num_results: int, + **kwargs, + ) -> SearchResult: + """ + Execute search with sequential pagination (Google only). + + Fetches pages one at a time until num_results reached or no more results. 
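+
+        Worked example (illustrative offsets): with PAGE_SIZE=10 and
+        num_results=30, the first request is sent with start=0; each
+        subsequent offset comes from the response's pagination block
+        (next_page_start, or parsed out of next_page_link), typically
+        10 and then 20. The loop ends once num_results are collected,
+        a page comes back empty, MAX_PAGES is hit, or
+        PAGINATION_TIMEOUT elapses.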
+ """ + trigger_sent_at = datetime.now(timezone.utc) + pagination_start_time = time.time() + + all_results: List[Dict[str, Any]] = [] + pages_fetched = 0 + current_start = 0 + google_total_results = None + last_error = None + + while len(all_results) < num_results and pages_fetched < self.MAX_PAGES: + # Check total timeout + elapsed = time.time() - pagination_start_time + if elapsed > self.PAGINATION_TIMEOUT: + last_error = f"Pagination timeout after {int(elapsed)}s ({pages_fetched} pages)" + break + + # Build URL for current page + search_url = self.url_builder.build( + query=query, + location=location, + language=language, + device=device, + num_results=min(self.PAGE_SIZE, num_results - len(all_results)), + start=current_start, + **kwargs, + ) + + # Execute request + raw_data, data_fetched_at, error = await self._execute_serp_request( + search_url=search_url, + zone=zone, + trigger_sent_at=trigger_sent_at, + ) + + if error: + if pages_fetched == 0: return SearchResult( success=False, - query={"q": query}, - error=f"Search failed (HTTP {response.status}): {error_text}", + query={"q": query, "location": location, "language": language}, + error=f"Search failed: {error}", search_engine=self.SEARCH_ENGINE, trigger_sent_at=trigger_sent_at, data_fetched_at=data_fetched_at, ) + last_error = f"Page {pages_fetched + 1} failed: {error}" + break - try: - result = await retry_with_backoff( - _make_request, - max_retries=self.max_retries, - ) - return result - except Exception as e: - return SearchResult( - success=False, - query={"q": query}, - error=f"Search error: {str(e)}", - search_engine=self.SEARCH_ENGINE, - trigger_sent_at=trigger_sent_at, - data_fetched_at=datetime.now(timezone.utc), - ) + pages_fetched += 1 + + # Extract pagination info BEFORE normalizing + pagination = raw_data.get("pagination", {}) if isinstance(raw_data, dict) else {} + + # Normalize data + normalized_data = self.data_normalizer.normalize(raw_data) + page_results = normalized_data.get("results", []) + + if not page_results: + break + + # Preserve Google's total from first page + if pages_fetched == 1: + google_total_results = normalized_data.get("total_results") + + all_results.extend(page_results) + + # Determine next page offset + next_page_start = pagination.get("next_page_start") + + if next_page_start is None: + next_link = pagination.get("next_page_link", "") + if next_link: + match = re.search(r"start=(\d+)", next_link) + if match: + next_page_start = int(match.group(1)) + + if next_page_start is None or next_page_start <= current_start: + break + + current_start = next_page_start + + final_results = all_results[:num_results] + + return SearchResult( + success=True, + query={"q": query, "location": location, "language": language}, + data=final_results, + total_found=google_total_results, + search_engine=self.SEARCH_ENGINE, + country=location, + results_per_page=self.PAGE_SIZE, + trigger_sent_at=trigger_sent_at, + data_fetched_at=datetime.now(timezone.utc), + error=last_error, + ) async def _search_multiple_async( self, diff --git a/src/brightdata/api/serp/url_builder.py b/src/brightdata/api/serp/url_builder.py index ddb0203..5397177 100644 --- a/src/brightdata/api/serp/url_builder.py +++ b/src/brightdata/api/serp/url_builder.py @@ -33,11 +33,17 @@ def build( language: str = "en", device: str = "desktop", num_results: int = 10, + start: int = 0, **kwargs, ) -> str: """Build Google search URL with Bright Data parsing enabled.""" encoded_query = quote_plus(query) url = 
f"https://www.google.com/search?q={encoded_query}" + + # Add pagination offset if not first page + if start > 0: + url += f"&start={start}" + url += f"&num={num_results}" # Enable Bright Data SERP parsing diff --git a/src/brightdata/datasets/__init__.py b/src/brightdata/datasets/__init__.py index f170e23..b39c9ae 100644 --- a/src/brightdata/datasets/__init__.py +++ b/src/brightdata/datasets/__init__.py @@ -10,13 +10,92 @@ from .utils import export, export_json, export_jsonl, export_csv # Platform-specific datasets -from .linkedin import LinkedInPeopleProfiles, LinkedInCompanyProfiles -from .amazon import AmazonProducts +from .linkedin import LinkedInPeopleProfiles, LinkedInCompanyProfiles, LinkedInJobListings +from .amazon import AmazonProducts, AmazonReviews, AmazonSellersInfo from .crunchbase import CrunchbaseCompanies from .imdb import IMDBMovies from .nba import NBAPlayersStats from .goodreads import GoodreadsBooks from .world_population import WorldPopulation +from .companies_enriched import CompaniesEnriched +from .employees_enriched import EmployeesEnriched +from .glassdoor import GlassdoorCompanies, GlassdoorReviews, GlassdoorJobs +from .google_maps import GoogleMapsReviews +from .yelp import YelpBusinesses, YelpReviews +from .zoominfo import ZoomInfoCompanies +from .pitchbook import PitchBookCompanies +from .g2 import G2Products, G2Reviews +from .trustpilot import TrustpilotReviews +from .indeed import IndeedCompanies, IndeedJobs +from .xing import XingProfiles +from .slintel import SlintelCompanies +from .owler import OwlerCompanies +from .lawyers import USLawyers +from .manta import MantaBusinesses +from .ventureradar import VentureRadarCompanies +from .trustradius import TrustRadiusReviews +from .instagram import InstagramProfiles, InstagramPosts +from .tiktok import TikTokProfiles +from .real_estate import AustraliaRealEstate +from .walmart import WalmartProducts +from .mediamarkt import MediamarktProducts +from .fendi import FendiProducts +from .zalando import ZalandoProducts +from .sephora import SephoraProducts +from .zara import ZaraProducts, ZaraHomeProducts +from .mango import MangoProducts +from .massimo_dutti import MassimoDuttiProducts +from .otodom import OtodomPoland +from .webmotors import WebmotorsBrasil +from .airbnb import AirbnbProperties +from .asos import AsosProducts +from .chanel import ChanelProducts +from .ashley_furniture import AshleyFurnitureProducts +from .fanatics import FanaticsProducts +from .carters import CartersProducts +from .american_eagle import AmericanEagleProducts +from .ikea import IkeaProducts +from .hm import HMProducts +from .lego import LegoProducts +from .mattressfirm import MattressfirmProducts +from .crateandbarrel import CrateAndBarrelProducts +from .llbean import LLBeanProducts +from .shein import SheinProducts +from .toysrus import ToysRUsProducts +from .mybobs import MybobsProducts +from .sleepnumber import SleepNumberProducts +from .raymourflanigan import RaymourFlaniganProducts +from .inmuebles24 import Inmuebles24Mexico +from .mouser import MouserProducts +from .zillow import ZillowProperties +from .zonaprop import ZonapropArgentina +from .metrocuadrado import MetrocuadradoProperties +from .chileautos import ChileautosChile +from .infocasas import InfocasasUruguay +from .lazboy import LaZBoyProducts +from .properati import ProperatiProperties +from .yapo import YapoChile +from .toctoc import ToctocProperties +from .dior import DiorProducts +from .balenciaga import BalenciagaProducts +from .bottegaveneta import 
BottegaVenetaProducts +from .olx import OLXBrazil +from .celine import CelineProducts +from .loewe import LoeweProducts +from .berluti import BerlutiProducts +from .moynat import MoynatProducts +from .hermes import HermesProducts +from .delvaux import DelvauxProducts +from .prada import PradaProducts +from .montblanc import MontblancProducts +from .ysl import YSLProducts +from .world_zipcodes import WorldZipcodes +from .pinterest import PinterestPosts, PinterestProfiles +from .shopee import ShopeeProducts +from .lazada import LazadaProducts +from .youtube import YouTubeProfiles, YouTubeVideos, YouTubeComments +from .digikey import DigikeyProducts +from .facebook import FacebookPagesPosts __all__ = [ # Client @@ -37,8 +116,11 @@ # LinkedIn "LinkedInPeopleProfiles", "LinkedInCompanyProfiles", + "LinkedInJobListings", # Amazon "AmazonProducts", + "AmazonReviews", + "AmazonSellersInfo", # Crunchbase "CrunchbaseCompanies", # IMDB @@ -49,4 +131,172 @@ "GoodreadsBooks", # World Population "WorldPopulation", + # Companies Enriched + "CompaniesEnriched", + # Employees Enriched + "EmployeesEnriched", + # Glassdoor + "GlassdoorCompanies", + "GlassdoorReviews", + "GlassdoorJobs", + # Google Maps + "GoogleMapsReviews", + # Yelp + "YelpBusinesses", + "YelpReviews", + # ZoomInfo + "ZoomInfoCompanies", + # PitchBook + "PitchBookCompanies", + # G2 + "G2Products", + "G2Reviews", + # Trustpilot + "TrustpilotReviews", + # Indeed + "IndeedCompanies", + "IndeedJobs", + # Xing + "XingProfiles", + # Slintel + "SlintelCompanies", + # Owler + "OwlerCompanies", + # Lawyers + "USLawyers", + # Manta + "MantaBusinesses", + # VentureRadar + "VentureRadarCompanies", + # TrustRadius + "TrustRadiusReviews", + # Instagram + "InstagramProfiles", + "InstagramPosts", + # TikTok + "TikTokProfiles", + # Real Estate + "AustraliaRealEstate", + # Walmart + "WalmartProducts", + # Mediamarkt + "MediamarktProducts", + # Fendi + "FendiProducts", + # Zalando + "ZalandoProducts", + # Sephora + "SephoraProducts", + # Zara + "ZaraProducts", + "ZaraHomeProducts", + # Mango + "MangoProducts", + # Massimo Dutti + "MassimoDuttiProducts", + # Otodom + "OtodomPoland", + # Webmotors + "WebmotorsBrasil", + # Airbnb + "AirbnbProperties", + # Asos + "AsosProducts", + # Chanel + "ChanelProducts", + # Ashley Furniture + "AshleyFurnitureProducts", + # Fanatics + "FanaticsProducts", + # Carters + "CartersProducts", + # American Eagle + "AmericanEagleProducts", + # Ikea + "IkeaProducts", + # H&M + "HMProducts", + # Lego + "LegoProducts", + # Mattressfirm + "MattressfirmProducts", + # Crate and Barrel + "CrateAndBarrelProducts", + # L.L. 
Bean + "LLBeanProducts", + # Shein + "SheinProducts", + # Toys R Us + "ToysRUsProducts", + # Mybobs + "MybobsProducts", + # Sleep Number + "SleepNumberProducts", + # Raymour and Flanigan + "RaymourFlaniganProducts", + # Inmuebles24 + "Inmuebles24Mexico", + # Mouser + "MouserProducts", + # Zillow + "ZillowProperties", + # Zonaprop + "ZonapropArgentina", + # Metrocuadrado + "MetrocuadradoProperties", + # Chileautos + "ChileautosChile", + # Infocasas + "InfocasasUruguay", + # La-Z-Boy + "LaZBoyProducts", + # Properati + "ProperatiProperties", + # Yapo + "YapoChile", + # Toctoc + "ToctocProperties", + # Dior + "DiorProducts", + # Balenciaga + "BalenciagaProducts", + # Bottega Veneta + "BottegaVenetaProducts", + # OLX + "OLXBrazil", + # Celine + "CelineProducts", + # Loewe + "LoeweProducts", + # Berluti + "BerlutiProducts", + # Moynat + "MoynatProducts", + # Hermes + "HermesProducts", + # Delvaux + "DelvauxProducts", + # Prada + "PradaProducts", + # Montblanc + "MontblancProducts", + # YSL + "YSLProducts", + # World Zipcodes + "WorldZipcodes", + # Pinterest + "PinterestPosts", + "PinterestProfiles", + # Shopee + "ShopeeProducts", + # Lazada + "LazadaProducts", + # YouTube + "YouTubeProfiles", + "YouTubeVideos", + "YouTubeComments", + # Digikey + "DigikeyProducts", + # Facebook + "FacebookPagesPosts", ] diff --git a/src/brightdata/datasets/airbnb/__init__.py b/src/brightdata/datasets/airbnb/__init__.py new file mode 100644 index 0000000..54e360f --- /dev/null +++ b/src/brightdata/datasets/airbnb/__init__.py @@ -0,0 +1,5 @@ +"""Airbnb datasets.""" + +from .properties import AirbnbProperties + +__all__ = ["AirbnbProperties"] diff --git a/src/brightdata/datasets/airbnb/properties.py b/src/brightdata/datasets/airbnb/properties.py new file mode 100644 index 0000000..9a35869 --- /dev/null +++ b/src/brightdata/datasets/airbnb/properties.py @@ -0,0 +1,25 @@ +""" +Airbnb Properties dataset. + +Property listings from Airbnb. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class AirbnbProperties(BaseDataset): + """Airbnb Properties dataset.""" + + DATASET_ID = "gd_ld7ll037kqy322v05" + NAME = "airbnb_properties" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/amazon/__init__.py b/src/brightdata/datasets/amazon/__init__.py index 75ab6ed..5ab6998 100644 --- a/src/brightdata/datasets/amazon/__init__.py +++ b/src/brightdata/datasets/amazon/__init__.py @@ -1,5 +1,7 @@ """Amazon datasets.""" from .products import AmazonProducts +from .reviews import AmazonReviews +from .sellers import AmazonSellersInfo -__all__ = ["AmazonProducts"] +__all__ = ["AmazonProducts", "AmazonReviews", "AmazonSellersInfo"] diff --git a/src/brightdata/datasets/amazon/reviews.py b/src/brightdata/datasets/amazon/reviews.py new file mode 100644 index 0000000..256cd3b --- /dev/null +++ b/src/brightdata/datasets/amazon/reviews.py @@ -0,0 +1,25 @@ +""" +Amazon Reviews dataset. + +Product reviews from Amazon. + +Use get_metadata() to discover all available fields dynamically. 
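+
+Minimal usage sketch. Assumptions not shown in this file: the datasets
+client is reachable as client.datasets, and get_metadata() is awaitable
+like the rest of this async-first SDK.
+
+    reviews = client.datasets.amazon_reviews
+    meta = await reviews.get_metadata()  # field names, grouped by category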
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class AmazonReviews(BaseDataset): + """Amazon Reviews dataset.""" + + DATASET_ID = "gd_le8e811kzy4ggddlq" + NAME = "amazon_reviews" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/amazon/sellers.py b/src/brightdata/datasets/amazon/sellers.py new file mode 100644 index 0000000..417aa11 --- /dev/null +++ b/src/brightdata/datasets/amazon/sellers.py @@ -0,0 +1,25 @@ +""" +Amazon Sellers Info dataset. + +Seller information from Amazon marketplace. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class AmazonSellersInfo(BaseDataset): + """Amazon Sellers Info dataset.""" + + DATASET_ID = "gd_lhotzucw1etoe5iw1k" + NAME = "amazon_sellers_info" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/american_eagle/__init__.py b/src/brightdata/datasets/american_eagle/__init__.py new file mode 100644 index 0000000..45c615b --- /dev/null +++ b/src/brightdata/datasets/american_eagle/__init__.py @@ -0,0 +1,5 @@ +"""American Eagle datasets.""" + +from .products import AmericanEagleProducts + +__all__ = ["AmericanEagleProducts"] diff --git a/src/brightdata/datasets/american_eagle/products.py b/src/brightdata/datasets/american_eagle/products.py new file mode 100644 index 0000000..5469158 --- /dev/null +++ b/src/brightdata/datasets/american_eagle/products.py @@ -0,0 +1,25 @@ +""" +American Eagle Products dataset. + +Fashion product listings from American Eagle (AE.com). + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class AmericanEagleProducts(BaseDataset): + """American Eagle Products dataset.""" + + DATASET_ID = "gd_le6plu065keypwyir" + NAME = "american_eagle_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/ashley_furniture/__init__.py b/src/brightdata/datasets/ashley_furniture/__init__.py new file mode 100644 index 0000000..513a7f9 --- /dev/null +++ b/src/brightdata/datasets/ashley_furniture/__init__.py @@ -0,0 +1,5 @@ +"""Ashley Furniture datasets.""" + +from .products import AshleyFurnitureProducts + +__all__ = ["AshleyFurnitureProducts"] diff --git a/src/brightdata/datasets/ashley_furniture/products.py b/src/brightdata/datasets/ashley_furniture/products.py new file mode 100644 index 0000000..df41b74 --- /dev/null +++ b/src/brightdata/datasets/ashley_furniture/products.py @@ -0,0 +1,25 @@ +""" +Ashley Furniture Products dataset. + +Furniture product listings from Ashley Furniture. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class AshleyFurnitureProducts(BaseDataset): + """Ashley Furniture Products dataset.""" + + DATASET_ID = "gd_le1ddqrs16uevi5vc4" + NAME = "ashley_furniture_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/asos/__init__.py b/src/brightdata/datasets/asos/__init__.py new file mode 100644 index 0000000..9df9e50 --- /dev/null +++ b/src/brightdata/datasets/asos/__init__.py @@ -0,0 +1,5 @@ +"""Asos datasets.""" + +from .products import AsosProducts + +__all__ = ["AsosProducts"] diff --git a/src/brightdata/datasets/asos/products.py b/src/brightdata/datasets/asos/products.py new file mode 100644 index 0000000..a8e2959 --- /dev/null +++ b/src/brightdata/datasets/asos/products.py @@ -0,0 +1,25 @@ +""" +Asos Products dataset. + +Fashion product listings from Asos. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class AsosProducts(BaseDataset): + """Asos Products dataset.""" + + DATASET_ID = "gd_ldbg7we91cp53nr2z4" + NAME = "asos_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/balenciaga/__init__.py b/src/brightdata/datasets/balenciaga/__init__.py new file mode 100644 index 0000000..7f9604e --- /dev/null +++ b/src/brightdata/datasets/balenciaga/__init__.py @@ -0,0 +1,5 @@ +"""Balenciaga datasets.""" + +from .products import BalenciagaProducts + +__all__ = ["BalenciagaProducts"] diff --git a/src/brightdata/datasets/balenciaga/products.py b/src/brightdata/datasets/balenciaga/products.py new file mode 100644 index 0000000..2c0a6c1 --- /dev/null +++ b/src/brightdata/datasets/balenciaga/products.py @@ -0,0 +1,25 @@ +""" +Balenciaga Products dataset. + +Luxury product listings from Balenciaga. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class BalenciagaProducts(BaseDataset): + """Balenciaga Products dataset.""" + + DATASET_ID = "gd_lh7oemkb2f9h596dfn" + NAME = "balenciaga_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/berluti/__init__.py b/src/brightdata/datasets/berluti/__init__.py new file mode 100644 index 0000000..47aa4a3 --- /dev/null +++ b/src/brightdata/datasets/berluti/__init__.py @@ -0,0 +1,5 @@ +"""Berluti datasets.""" + +from .products import BerlutiProducts + +__all__ = ["BerlutiProducts"] diff --git a/src/brightdata/datasets/berluti/products.py b/src/brightdata/datasets/berluti/products.py new file mode 100644 index 0000000..4eb746e --- /dev/null +++ b/src/brightdata/datasets/berluti/products.py @@ -0,0 +1,25 @@ +""" +Berluti Products dataset. + +Luxury product listings from Berluti. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class BerlutiProducts(BaseDataset): + """Berluti Products dataset.""" + + DATASET_ID = "gd_lh7sef5p16tcupyuy3" + NAME = "berluti_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/bottegaveneta/__init__.py b/src/brightdata/datasets/bottegaveneta/__init__.py new file mode 100644 index 0000000..050bfd9 --- /dev/null +++ b/src/brightdata/datasets/bottegaveneta/__init__.py @@ -0,0 +1,5 @@ +"""Bottega Veneta datasets.""" + +from .products import BottegaVenetaProducts + +__all__ = ["BottegaVenetaProducts"] diff --git a/src/brightdata/datasets/bottegaveneta/products.py b/src/brightdata/datasets/bottegaveneta/products.py new file mode 100644 index 0000000..d234c92 --- /dev/null +++ b/src/brightdata/datasets/bottegaveneta/products.py @@ -0,0 +1,25 @@ +""" +Bottega Veneta Products dataset. + +Luxury product listings from Bottega Veneta. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class BottegaVenetaProducts(BaseDataset): + """Bottega Veneta Products dataset.""" + + DATASET_ID = "gd_lh7os5q91y20h69xj" + NAME = "bottegaveneta_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/carters/__init__.py b/src/brightdata/datasets/carters/__init__.py new file mode 100644 index 0000000..6e32dc0 --- /dev/null +++ b/src/brightdata/datasets/carters/__init__.py @@ -0,0 +1,5 @@ +"""Carters datasets.""" + +from .products import CartersProducts + +__all__ = ["CartersProducts"] diff --git a/src/brightdata/datasets/carters/products.py b/src/brightdata/datasets/carters/products.py new file mode 100644 index 0000000..5a38bde --- /dev/null +++ b/src/brightdata/datasets/carters/products.py @@ -0,0 +1,25 @@ +""" +Carters Products dataset. + +Children's clothing and product listings from Carters. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class CartersProducts(BaseDataset): + """Carters Products dataset.""" + + DATASET_ID = "gd_le60f5v0dj17xgv6u" + NAME = "carters_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/celine/__init__.py b/src/brightdata/datasets/celine/__init__.py new file mode 100644 index 0000000..9c30c1f --- /dev/null +++ b/src/brightdata/datasets/celine/__init__.py @@ -0,0 +1,5 @@ +"""Celine datasets.""" + +from .products import CelineProducts + +__all__ = ["CelineProducts"] diff --git a/src/brightdata/datasets/celine/products.py b/src/brightdata/datasets/celine/products.py new file mode 100644 index 0000000..aef3ca6 --- /dev/null +++ b/src/brightdata/datasets/celine/products.py @@ -0,0 +1,25 @@ +""" +Celine Products dataset. + +Luxury product listings from Celine. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class CelineProducts(BaseDataset): + """Celine Products dataset.""" + + DATASET_ID = "gd_lh7qnf8rwtn2c2uuc" + NAME = "celine_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/chanel/__init__.py b/src/brightdata/datasets/chanel/__init__.py new file mode 100644 index 0000000..0425ce6 --- /dev/null +++ b/src/brightdata/datasets/chanel/__init__.py @@ -0,0 +1,5 @@ +"""Chanel datasets.""" + +from .products import ChanelProducts + +__all__ = ["ChanelProducts"] diff --git a/src/brightdata/datasets/chanel/products.py b/src/brightdata/datasets/chanel/products.py new file mode 100644 index 0000000..83c2a09 --- /dev/null +++ b/src/brightdata/datasets/chanel/products.py @@ -0,0 +1,25 @@ +""" +Chanel Products dataset. + +Luxury product listings from Chanel. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class ChanelProducts(BaseDataset): + """Chanel Products dataset.""" + + DATASET_ID = "gd_ldwwuwqe1oh3zav3js" + NAME = "chanel_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/chileautos/__init__.py b/src/brightdata/datasets/chileautos/__init__.py new file mode 100644 index 0000000..08ec01b --- /dev/null +++ b/src/brightdata/datasets/chileautos/__init__.py @@ -0,0 +1,5 @@ +"""Chileautos datasets.""" + +from .cars import ChileautosChile + +__all__ = ["ChileautosChile"] diff --git a/src/brightdata/datasets/chileautos/cars.py b/src/brightdata/datasets/chileautos/cars.py new file mode 100644 index 0000000..c8c6d36 --- /dev/null +++ b/src/brightdata/datasets/chileautos/cars.py @@ -0,0 +1,25 @@ +""" +Chileautos Chile dataset. + +Car listings from Chileautos (Chile). + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class ChileautosChile(BaseDataset): + """Chileautos Chile car listings dataset.""" + + DATASET_ID = "gd_lfsbqgb01iiit5ppju" + NAME = "chileautos_chile" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/client.py b/src/brightdata/datasets/client.py index 70081c2..1e03a77 100644 --- a/src/brightdata/datasets/client.py +++ b/src/brightdata/datasets/client.py @@ -5,13 +5,92 @@ from typing import List, Optional, TYPE_CHECKING from .models import DatasetInfo -from .linkedin import LinkedInPeopleProfiles, LinkedInCompanyProfiles -from .amazon import AmazonProducts +from .linkedin import LinkedInPeopleProfiles, LinkedInCompanyProfiles, LinkedInJobListings +from .amazon import AmazonProducts, AmazonReviews, AmazonSellersInfo from .crunchbase import CrunchbaseCompanies from .imdb import IMDBMovies from .nba import NBAPlayersStats from .goodreads import GoodreadsBooks from .world_population import WorldPopulation +from .companies_enriched import CompaniesEnriched +from .employees_enriched import EmployeesEnriched +from .glassdoor import GlassdoorCompanies, GlassdoorReviews, GlassdoorJobs +from .google_maps import GoogleMapsReviews +from .yelp import YelpBusinesses, YelpReviews +from .zoominfo import ZoomInfoCompanies +from .pitchbook import PitchBookCompanies +from .g2 import G2Products, G2Reviews +from .trustpilot import TrustpilotReviews +from .indeed import IndeedCompanies, IndeedJobs +from .xing import XingProfiles +from .slintel import SlintelCompanies +from .owler import OwlerCompanies +from .lawyers import USLawyers +from .manta import MantaBusinesses +from .ventureradar import VentureRadarCompanies +from .trustradius import TrustRadiusReviews +from .instagram import InstagramProfiles, InstagramPosts +from .tiktok import TikTokProfiles +from .real_estate import AustraliaRealEstate +from .walmart import WalmartProducts +from .mediamarkt import MediamarktProducts +from .fendi import FendiProducts +from .zalando import ZalandoProducts +from .sephora import SephoraProducts +from .zara import ZaraProducts, ZaraHomeProducts +from .mango import MangoProducts +from .massimo_dutti import MassimoDuttiProducts +from .otodom import OtodomPoland +from .webmotors import WebmotorsBrasil +from .airbnb import AirbnbProperties +from .asos import AsosProducts +from .chanel import ChanelProducts +from .ashley_furniture import AshleyFurnitureProducts +from .fanatics import FanaticsProducts +from .carters import CartersProducts +from .american_eagle import AmericanEagleProducts +from .ikea import IkeaProducts +from .hm import HMProducts +from .lego import LegoProducts +from .mattressfirm import MattressfirmProducts +from .crateandbarrel import CrateAndBarrelProducts +from .llbean import LLBeanProducts +from .shein import SheinProducts +from .toysrus import ToysRUsProducts +from .mybobs import MybobsProducts +from .sleepnumber import SleepNumberProducts +from .raymourflanigan import RaymourFlaniganProducts +from .inmuebles24 import Inmuebles24Mexico +from .mouser import MouserProducts +from .zillow import ZillowProperties +from .zonaprop import ZonapropArgentina +from .metrocuadrado import MetrocuadradoProperties +from .chileautos import ChileautosChile +from .infocasas import InfocasasUruguay +from .lazboy import 
LaZBoyProducts +from .properati import ProperatiProperties +from .yapo import YapoChile +from .toctoc import ToctocProperties +from .dior import DiorProducts +from .balenciaga import BalenciagaProducts +from .bottegaveneta import BottegaVenetaProducts +from .olx import OLXBrazil +from .celine import CelineProducts +from .loewe import LoeweProducts +from .berluti import BerlutiProducts +from .moynat import MoynatProducts +from .hermes import HermesProducts +from .delvaux import DelvauxProducts +from .prada import PradaProducts +from .montblanc import MontblancProducts +from .ysl import YSLProducts +from .world_zipcodes import WorldZipcodes +from .pinterest import PinterestPosts, PinterestProfiles +from .shopee import ShopeeProducts +from .lazada import LazadaProducts +from .youtube import YouTubeProfiles, YouTubeVideos, YouTubeComments +from .digikey import DigikeyProducts +from .facebook import FacebookPagesPosts if TYPE_CHECKING: from ..core.async_engine import AsyncEngine @@ -49,12 +128,104 @@ def __init__(self, engine: "AsyncEngine"): # Lazy-loaded dataset instances self._linkedin_profiles: Optional[LinkedInPeopleProfiles] = None self._linkedin_companies: Optional[LinkedInCompanyProfiles] = None + self._linkedin_job_listings: Optional[LinkedInJobListings] = None self._amazon_products: Optional[AmazonProducts] = None + self._amazon_reviews: Optional[AmazonReviews] = None self._crunchbase_companies: Optional[CrunchbaseCompanies] = None self._imdb_movies: Optional[IMDBMovies] = None self._nba_players_stats: Optional[NBAPlayersStats] = None self._goodreads_books: Optional[GoodreadsBooks] = None self._world_population: Optional[WorldPopulation] = None + self._companies_enriched: Optional[CompaniesEnriched] = None + self._employees_enriched: Optional[EmployeesEnriched] = None + self._glassdoor_companies: Optional[GlassdoorCompanies] = None + self._glassdoor_reviews: Optional[GlassdoorReviews] = None + self._glassdoor_jobs: Optional[GlassdoorJobs] = None + self._google_maps_reviews: Optional[GoogleMapsReviews] = None + self._yelp_businesses: Optional[YelpBusinesses] = None + self._yelp_reviews: Optional[YelpReviews] = None + self._zoominfo_companies: Optional[ZoomInfoCompanies] = None + self._pitchbook_companies: Optional[PitchBookCompanies] = None + self._g2_products: Optional[G2Products] = None + self._g2_reviews: Optional[G2Reviews] = None + self._trustpilot_reviews: Optional[TrustpilotReviews] = None + self._indeed_companies: Optional[IndeedCompanies] = None + self._xing_profiles: Optional[XingProfiles] = None + self._slintel_companies: Optional[SlintelCompanies] = None + self._owler_companies: Optional[OwlerCompanies] = None + self._us_lawyers: Optional[USLawyers] = None + self._manta_businesses: Optional[MantaBusinesses] = None + self._ventureradar_companies: Optional[VentureRadarCompanies] = None + self._trustradius_reviews: Optional[TrustRadiusReviews] = None + self._instagram_profiles: Optional[InstagramProfiles] = None + self._tiktok_profiles: Optional[TikTokProfiles] = None + self._australia_real_estate: Optional[AustraliaRealEstate] = None + self._indeed_jobs: Optional[IndeedJobs] = None + self._walmart_products: Optional[WalmartProducts] = None + self._mediamarkt_products: Optional[MediamarktProducts] = None + self._fendi_products: Optional[FendiProducts] = None + self._zalando_products: Optional[ZalandoProducts] = None + self._sephora_products: Optional[SephoraProducts] = None + self._zara_products: Optional[ZaraProducts] = None + self._zara_home_products: 
Optional[ZaraHomeProducts] = None + self._mango_products: Optional[MangoProducts] = None + self._massimo_dutti_products: Optional[MassimoDuttiProducts] = None + self._otodom_poland: Optional[OtodomPoland] = None + self._webmotors_brasil: Optional[WebmotorsBrasil] = None + self._airbnb_properties: Optional[AirbnbProperties] = None + self._asos_products: Optional[AsosProducts] = None + self._chanel_products: Optional[ChanelProducts] = None + self._ashley_furniture_products: Optional[AshleyFurnitureProducts] = None + self._fanatics_products: Optional[FanaticsProducts] = None + self._carters_products: Optional[CartersProducts] = None + self._american_eagle_products: Optional[AmericanEagleProducts] = None + self._ikea_products: Optional[IkeaProducts] = None + self._hm_products: Optional[HMProducts] = None + self._lego_products: Optional[LegoProducts] = None + self._mattressfirm_products: Optional[MattressfirmProducts] = None + self._crateandbarrel_products: Optional[CrateAndBarrelProducts] = None + self._llbean_products: Optional[LLBeanProducts] = None + self._shein_products: Optional[SheinProducts] = None + self._toysrus_products: Optional[ToysRUsProducts] = None + self._mybobs_products: Optional[MybobsProducts] = None + self._sleepnumber_products: Optional[SleepNumberProducts] = None + self._raymourflanigan_products: Optional[RaymourFlaniganProducts] = None + self._inmuebles24_mexico: Optional[Inmuebles24Mexico] = None + self._mouser_products: Optional[MouserProducts] = None + self._zillow_properties: Optional[ZillowProperties] = None + self._zonaprop_argentina: Optional[ZonapropArgentina] = None + self._metrocuadrado_properties: Optional[MetrocuadradoProperties] = None + self._chileautos_chile: Optional[ChileautosChile] = None + self._infocasas_uruguay: Optional[InfocasasUruguay] = None + self._lazboy_products: Optional[LaZBoyProducts] = None + self._properati_properties: Optional[ProperatiProperties] = None + self._yapo_chile: Optional[YapoChile] = None + self._toctoc_properties: Optional[ToctocProperties] = None + self._dior_products: Optional[DiorProducts] = None + self._balenciaga_products: Optional[BalenciagaProducts] = None + self._bottegaveneta_products: Optional[BottegaVenetaProducts] = None + self._olx_brazil: Optional[OLXBrazil] = None + self._celine_products: Optional[CelineProducts] = None + self._loewe_products: Optional[LoeweProducts] = None + self._berluti_products: Optional[BerlutiProducts] = None + self._moynat_products: Optional[MoynatProducts] = None + self._hermes_products: Optional[HermesProducts] = None + self._delvaux_products: Optional[DelvauxProducts] = None + self._prada_products: Optional[PradaProducts] = None + self._montblanc_products: Optional[MontblancProducts] = None + self._ysl_products: Optional[YSLProducts] = None + self._amazon_sellers_info: Optional[AmazonSellersInfo] = None + self._world_zipcodes: Optional[WorldZipcodes] = None + self._pinterest_posts: Optional[PinterestPosts] = None + self._pinterest_profiles: Optional[PinterestProfiles] = None + self._shopee_products: Optional[ShopeeProducts] = None + self._lazada_products: Optional[LazadaProducts] = None + self._instagram_posts: Optional[InstagramPosts] = None + self._youtube_profiles: Optional[YouTubeProfiles] = None + self._youtube_videos: Optional[YouTubeVideos] = None + self._youtube_comments: Optional[YouTubeComments] = None + self._digikey_products: Optional[DigikeyProducts] = None + self._facebook_pages_posts: Optional[FacebookPagesPosts] = None async def list(self) -> List[DatasetInfo]: """ @@ 
-93,6 +264,13 @@ def linkedin_companies(self) -> LinkedInCompanyProfiles: self._linkedin_companies = LinkedInCompanyProfiles(self._engine) return self._linkedin_companies + @property + def linkedin_job_listings(self) -> LinkedInJobListings: + """LinkedIn Job Listings dataset.""" + if self._linkedin_job_listings is None: + self._linkedin_job_listings = LinkedInJobListings(self._engine) + return self._linkedin_job_listings + @property def amazon_products(self) -> AmazonProducts: """Amazon Products dataset.""" @@ -100,6 +278,13 @@ def amazon_products(self) -> AmazonProducts: self._amazon_products = AmazonProducts(self._engine) return self._amazon_products + @property + def amazon_reviews(self) -> AmazonReviews: + """Amazon Reviews dataset.""" + if self._amazon_reviews is None: + self._amazon_reviews = AmazonReviews(self._engine) + return self._amazon_reviews + @property def crunchbase_companies(self) -> CrunchbaseCompanies: """Crunchbase Companies dataset (2.3M+ records).""" @@ -134,3 +319,633 @@ def world_population(self) -> WorldPopulation: if self._world_population is None: self._world_population = WorldPopulation(self._engine) return self._world_population + + @property + def companies_enriched(self) -> CompaniesEnriched: + """Companies Enriched dataset - multi-source company information.""" + if self._companies_enriched is None: + self._companies_enriched = CompaniesEnriched(self._engine) + return self._companies_enriched + + @property + def employees_enriched(self) -> EmployeesEnriched: + """Employees Business Enriched dataset - LinkedIn profiles with company data.""" + if self._employees_enriched is None: + self._employees_enriched = EmployeesEnriched(self._engine) + return self._employees_enriched + + @property + def glassdoor_companies(self) -> GlassdoorCompanies: + """Glassdoor Companies Overview dataset - ratings, reviews, and company details.""" + if self._glassdoor_companies is None: + self._glassdoor_companies = GlassdoorCompanies(self._engine) + return self._glassdoor_companies + + @property + def glassdoor_reviews(self) -> GlassdoorReviews: + """Glassdoor Companies Reviews dataset - employee reviews and ratings.""" + if self._glassdoor_reviews is None: + self._glassdoor_reviews = GlassdoorReviews(self._engine) + return self._glassdoor_reviews + + @property + def glassdoor_jobs(self) -> GlassdoorJobs: + """Glassdoor Job Listings dataset - job postings with company data.""" + if self._glassdoor_jobs is None: + self._glassdoor_jobs = GlassdoorJobs(self._engine) + return self._glassdoor_jobs + + @property + def google_maps_reviews(self) -> GoogleMapsReviews: + """Google Maps Reviews dataset - place reviews and ratings.""" + if self._google_maps_reviews is None: + self._google_maps_reviews = GoogleMapsReviews(self._engine) + return self._google_maps_reviews + + @property + def yelp_businesses(self) -> YelpBusinesses: + """Yelp Businesses Overview dataset - business listings and ratings.""" + if self._yelp_businesses is None: + self._yelp_businesses = YelpBusinesses(self._engine) + return self._yelp_businesses + + @property + def yelp_reviews(self) -> YelpReviews: + """Yelp Business Reviews dataset - individual business reviews.""" + if self._yelp_reviews is None: + self._yelp_reviews = YelpReviews(self._engine) + return self._yelp_reviews + + @property + def zoominfo_companies(self) -> ZoomInfoCompanies: + """ZoomInfo Companies dataset - company data with financials and contacts.""" + if self._zoominfo_companies is None: + self._zoominfo_companies = 
ZoomInfoCompanies(self._engine) + return self._zoominfo_companies + + @property + def pitchbook_companies(self) -> PitchBookCompanies: + """PitchBook Companies dataset - PE/VC company data with deals.""" + if self._pitchbook_companies is None: + self._pitchbook_companies = PitchBookCompanies(self._engine) + return self._pitchbook_companies + + @property + def g2_products(self) -> G2Products: + """G2 Software Product Overview dataset - software ratings and reviews.""" + if self._g2_products is None: + self._g2_products = G2Products(self._engine) + return self._g2_products + + @property + def g2_reviews(self) -> G2Reviews: + """G2 Software Product Reviews dataset - individual product reviews.""" + if self._g2_reviews is None: + self._g2_reviews = G2Reviews(self._engine) + return self._g2_reviews + + @property + def trustpilot_reviews(self) -> TrustpilotReviews: + """Trustpilot Business Reviews dataset - company reviews and ratings.""" + if self._trustpilot_reviews is None: + self._trustpilot_reviews = TrustpilotReviews(self._engine) + return self._trustpilot_reviews + + @property + def indeed_companies(self) -> IndeedCompanies: + """Indeed Companies Info dataset - company profiles with jobs and reviews.""" + if self._indeed_companies is None: + self._indeed_companies = IndeedCompanies(self._engine) + return self._indeed_companies + + @property + def xing_profiles(self) -> XingProfiles: + """Xing Social Network Profiles dataset - professional profiles.""" + if self._xing_profiles is None: + self._xing_profiles = XingProfiles(self._engine) + return self._xing_profiles + + @property + def slintel_companies(self) -> SlintelCompanies: + """Slintel 6sense Company Information dataset - technographics and company data.""" + if self._slintel_companies is None: + self._slintel_companies = SlintelCompanies(self._engine) + return self._slintel_companies + + @property + def owler_companies(self) -> OwlerCompanies: + """Owler Companies Information dataset - competitive intelligence and metrics.""" + if self._owler_companies is None: + self._owler_companies = OwlerCompanies(self._engine) + return self._owler_companies + + @property + def us_lawyers(self) -> USLawyers: + """US Lawyers Directory dataset - lawyer profiles and practice areas.""" + if self._us_lawyers is None: + self._us_lawyers = USLawyers(self._engine) + return self._us_lawyers + + @property + def manta_businesses(self) -> MantaBusinesses: + """Manta Businesses dataset - business listings with revenue and employees.""" + if self._manta_businesses is None: + self._manta_businesses = MantaBusinesses(self._engine) + return self._manta_businesses + + @property + def ventureradar_companies(self) -> VentureRadarCompanies: + """VentureRadar Company Information dataset - startup intelligence.""" + if self._ventureradar_companies is None: + self._ventureradar_companies = VentureRadarCompanies(self._engine) + return self._ventureradar_companies + + @property + def trustradius_reviews(self) -> TrustRadiusReviews: + """TrustRadius Product Reviews dataset - software product reviews.""" + if self._trustradius_reviews is None: + self._trustradius_reviews = TrustRadiusReviews(self._engine) + return self._trustradius_reviews + + @property + def instagram_profiles(self) -> InstagramProfiles: + """Instagram Profiles dataset - user profiles and engagement.""" + if self._instagram_profiles is None: + self._instagram_profiles = InstagramProfiles(self._engine) + return self._instagram_profiles + + @property + def tiktok_profiles(self) -> TikTokProfiles: + """TikTok 
Profiles dataset - user profiles and engagement.""" + if self._tiktok_profiles is None: + self._tiktok_profiles = TikTokProfiles(self._engine) + return self._tiktok_profiles + + @property + def australia_real_estate(self) -> AustraliaRealEstate: + """Australia Real Estate Properties dataset.""" + if self._australia_real_estate is None: + self._australia_real_estate = AustraliaRealEstate(self._engine) + return self._australia_real_estate + + @property + def indeed_jobs(self) -> IndeedJobs: + """Indeed Job Listings dataset.""" + if self._indeed_jobs is None: + self._indeed_jobs = IndeedJobs(self._engine) + return self._indeed_jobs + + @property + def walmart_products(self) -> WalmartProducts: + """Walmart Products dataset.""" + if self._walmart_products is None: + self._walmart_products = WalmartProducts(self._engine) + return self._walmart_products + + @property + def mediamarkt_products(self) -> MediamarktProducts: + """Mediamarkt.de Products dataset.""" + if self._mediamarkt_products is None: + self._mediamarkt_products = MediamarktProducts(self._engine) + return self._mediamarkt_products + + @property + def fendi_products(self) -> FendiProducts: + """Fendi Products dataset.""" + if self._fendi_products is None: + self._fendi_products = FendiProducts(self._engine) + return self._fendi_products + + @property + def zalando_products(self) -> ZalandoProducts: + """Zalando Products dataset.""" + if self._zalando_products is None: + self._zalando_products = ZalandoProducts(self._engine) + return self._zalando_products + + @property + def sephora_products(self) -> SephoraProducts: + """Sephora Products dataset.""" + if self._sephora_products is None: + self._sephora_products = SephoraProducts(self._engine) + return self._sephora_products + + @property + def zara_products(self) -> ZaraProducts: + """Zara Products dataset.""" + if self._zara_products is None: + self._zara_products = ZaraProducts(self._engine) + return self._zara_products + + @property + def zara_home_products(self) -> ZaraHomeProducts: + """Zara Home Products dataset.""" + if self._zara_home_products is None: + self._zara_home_products = ZaraHomeProducts(self._engine) + return self._zara_home_products + + @property + def mango_products(self) -> MangoProducts: + """Mango Products dataset.""" + if self._mango_products is None: + self._mango_products = MangoProducts(self._engine) + return self._mango_products + + @property + def massimo_dutti_products(self) -> MassimoDuttiProducts: + """Massimo Dutti Products dataset.""" + if self._massimo_dutti_products is None: + self._massimo_dutti_products = MassimoDuttiProducts(self._engine) + return self._massimo_dutti_products + + @property + def otodom_poland(self) -> OtodomPoland: + """Otodom Poland real estate dataset.""" + if self._otodom_poland is None: + self._otodom_poland = OtodomPoland(self._engine) + return self._otodom_poland + + @property + def webmotors_brasil(self) -> WebmotorsBrasil: + """Webmotors Brasil vehicle listings dataset.""" + if self._webmotors_brasil is None: + self._webmotors_brasil = WebmotorsBrasil(self._engine) + return self._webmotors_brasil + + @property + def airbnb_properties(self) -> AirbnbProperties: + """Airbnb Properties dataset.""" + if self._airbnb_properties is None: + self._airbnb_properties = AirbnbProperties(self._engine) + return self._airbnb_properties + + @property + def asos_products(self) -> AsosProducts: + """Asos Products dataset.""" + if self._asos_products is None: + self._asos_products = AsosProducts(self._engine) + return 
self._asos_products + + @property + def chanel_products(self) -> ChanelProducts: + """Chanel Products dataset.""" + if self._chanel_products is None: + self._chanel_products = ChanelProducts(self._engine) + return self._chanel_products + + @property + def ashley_furniture_products(self) -> AshleyFurnitureProducts: + """Ashley Furniture Products dataset.""" + if self._ashley_furniture_products is None: + self._ashley_furniture_products = AshleyFurnitureProducts(self._engine) + return self._ashley_furniture_products + + @property + def fanatics_products(self) -> FanaticsProducts: + """Fanatics Products dataset.""" + if self._fanatics_products is None: + self._fanatics_products = FanaticsProducts(self._engine) + return self._fanatics_products + + @property + def carters_products(self) -> CartersProducts: + """Carters Products dataset.""" + if self._carters_products is None: + self._carters_products = CartersProducts(self._engine) + return self._carters_products + + @property + def american_eagle_products(self) -> AmericanEagleProducts: + """American Eagle Products dataset.""" + if self._american_eagle_products is None: + self._american_eagle_products = AmericanEagleProducts(self._engine) + return self._american_eagle_products + + @property + def ikea_products(self) -> IkeaProducts: + """Ikea Products dataset.""" + if self._ikea_products is None: + self._ikea_products = IkeaProducts(self._engine) + return self._ikea_products + + @property + def hm_products(self) -> HMProducts: + """H&M Products dataset.""" + if self._hm_products is None: + self._hm_products = HMProducts(self._engine) + return self._hm_products + + @property + def lego_products(self) -> LegoProducts: + """Lego Products dataset.""" + if self._lego_products is None: + self._lego_products = LegoProducts(self._engine) + return self._lego_products + + @property + def mattressfirm_products(self) -> MattressfirmProducts: + """Mattressfirm Products dataset.""" + if self._mattressfirm_products is None: + self._mattressfirm_products = MattressfirmProducts(self._engine) + return self._mattressfirm_products + + @property + def crateandbarrel_products(self) -> CrateAndBarrelProducts: + """Crate and Barrel Products dataset.""" + if self._crateandbarrel_products is None: + self._crateandbarrel_products = CrateAndBarrelProducts(self._engine) + return self._crateandbarrel_products + + @property + def llbean_products(self) -> LLBeanProducts: + """L.L. 
Bean Products dataset.""" + if self._llbean_products is None: + self._llbean_products = LLBeanProducts(self._engine) + return self._llbean_products + + @property + def shein_products(self) -> SheinProducts: + """Shein Products dataset.""" + if self._shein_products is None: + self._shein_products = SheinProducts(self._engine) + return self._shein_products + + @property + def toysrus_products(self) -> ToysRUsProducts: + """Toys R Us Products dataset.""" + if self._toysrus_products is None: + self._toysrus_products = ToysRUsProducts(self._engine) + return self._toysrus_products + + @property + def mybobs_products(self) -> MybobsProducts: + """Mybobs Products dataset.""" + if self._mybobs_products is None: + self._mybobs_products = MybobsProducts(self._engine) + return self._mybobs_products + + @property + def sleepnumber_products(self) -> SleepNumberProducts: + """Sleep Number Products dataset.""" + if self._sleepnumber_products is None: + self._sleepnumber_products = SleepNumberProducts(self._engine) + return self._sleepnumber_products + + @property + def raymourflanigan_products(self) -> RaymourFlaniganProducts: + """Raymour and Flanigan Products dataset.""" + if self._raymourflanigan_products is None: + self._raymourflanigan_products = RaymourFlaniganProducts(self._engine) + return self._raymourflanigan_products + + @property + def inmuebles24_mexico(self) -> Inmuebles24Mexico: + """Inmuebles24 Mexico real estate dataset.""" + if self._inmuebles24_mexico is None: + self._inmuebles24_mexico = Inmuebles24Mexico(self._engine) + return self._inmuebles24_mexico + + @property + def mouser_products(self) -> MouserProducts: + """Mouser Products dataset.""" + if self._mouser_products is None: + self._mouser_products = MouserProducts(self._engine) + return self._mouser_products + + @property + def zillow_properties(self) -> ZillowProperties: + """Zillow Properties dataset.""" + if self._zillow_properties is None: + self._zillow_properties = ZillowProperties(self._engine) + return self._zillow_properties + + @property + def zonaprop_argentina(self) -> ZonapropArgentina: + """Zonaprop Argentina real estate dataset.""" + if self._zonaprop_argentina is None: + self._zonaprop_argentina = ZonapropArgentina(self._engine) + return self._zonaprop_argentina + + @property + def metrocuadrado_properties(self) -> MetrocuadradoProperties: + """Metrocuadrado Properties dataset.""" + if self._metrocuadrado_properties is None: + self._metrocuadrado_properties = MetrocuadradoProperties(self._engine) + return self._metrocuadrado_properties + + @property + def chileautos_chile(self) -> ChileautosChile: + """Chileautos Chile car listings dataset.""" + if self._chileautos_chile is None: + self._chileautos_chile = ChileautosChile(self._engine) + return self._chileautos_chile + + @property + def infocasas_uruguay(self) -> InfocasasUruguay: + """Infocasas Uruguay real estate dataset.""" + if self._infocasas_uruguay is None: + self._infocasas_uruguay = InfocasasUruguay(self._engine) + return self._infocasas_uruguay + + @property + def lazboy_products(self) -> LaZBoyProducts: + """La-Z-Boy Products dataset.""" + if self._lazboy_products is None: + self._lazboy_products = LaZBoyProducts(self._engine) + return self._lazboy_products + + @property + def properati_properties(self) -> ProperatiProperties: + """Properati Properties dataset.""" + if self._properati_properties is None: + self._properati_properties = ProperatiProperties(self._engine) + return self._properati_properties + + @property + def yapo_chile(self) -> YapoChile: 
+ """Yapo Chile marketplace ads dataset.""" + if self._yapo_chile is None: + self._yapo_chile = YapoChile(self._engine) + return self._yapo_chile + + @property + def toctoc_properties(self) -> ToctocProperties: + """Toctoc Properties dataset.""" + if self._toctoc_properties is None: + self._toctoc_properties = ToctocProperties(self._engine) + return self._toctoc_properties + + @property + def dior_products(self) -> DiorProducts: + """Dior Products dataset.""" + if self._dior_products is None: + self._dior_products = DiorProducts(self._engine) + return self._dior_products + + @property + def balenciaga_products(self) -> BalenciagaProducts: + """Balenciaga Products dataset.""" + if self._balenciaga_products is None: + self._balenciaga_products = BalenciagaProducts(self._engine) + return self._balenciaga_products + + @property + def bottegaveneta_products(self) -> BottegaVenetaProducts: + """Bottega Veneta Products dataset.""" + if self._bottegaveneta_products is None: + self._bottegaveneta_products = BottegaVenetaProducts(self._engine) + return self._bottegaveneta_products + + @property + def olx_brazil(self) -> OLXBrazil: + """OLX Brazil marketplace ads dataset.""" + if self._olx_brazil is None: + self._olx_brazil = OLXBrazil(self._engine) + return self._olx_brazil + + @property + def celine_products(self) -> CelineProducts: + """Celine Products dataset.""" + if self._celine_products is None: + self._celine_products = CelineProducts(self._engine) + return self._celine_products + + @property + def loewe_products(self) -> LoeweProducts: + """Loewe Products dataset.""" + if self._loewe_products is None: + self._loewe_products = LoeweProducts(self._engine) + return self._loewe_products + + @property + def berluti_products(self) -> BerlutiProducts: + """Berluti Products dataset.""" + if self._berluti_products is None: + self._berluti_products = BerlutiProducts(self._engine) + return self._berluti_products + + @property + def moynat_products(self) -> MoynatProducts: + """Moynat Products dataset.""" + if self._moynat_products is None: + self._moynat_products = MoynatProducts(self._engine) + return self._moynat_products + + @property + def hermes_products(self) -> HermesProducts: + """Hermes Products dataset.""" + if self._hermes_products is None: + self._hermes_products = HermesProducts(self._engine) + return self._hermes_products + + @property + def delvaux_products(self) -> DelvauxProducts: + """Delvaux Products dataset.""" + if self._delvaux_products is None: + self._delvaux_products = DelvauxProducts(self._engine) + return self._delvaux_products + + @property + def prada_products(self) -> PradaProducts: + """Prada Products dataset.""" + if self._prada_products is None: + self._prada_products = PradaProducts(self._engine) + return self._prada_products + + @property + def montblanc_products(self) -> MontblancProducts: + """Montblanc Products dataset.""" + if self._montblanc_products is None: + self._montblanc_products = MontblancProducts(self._engine) + return self._montblanc_products + + @property + def ysl_products(self) -> YSLProducts: + """YSL Products dataset.""" + if self._ysl_products is None: + self._ysl_products = YSLProducts(self._engine) + return self._ysl_products + + @property + def amazon_sellers_info(self) -> AmazonSellersInfo: + """Amazon Sellers Info dataset.""" + if self._amazon_sellers_info is None: + self._amazon_sellers_info = AmazonSellersInfo(self._engine) + return self._amazon_sellers_info + + @property + def world_zipcodes(self) -> WorldZipcodes: + """World Zipcodes 
dataset.""" + if self._world_zipcodes is None: + self._world_zipcodes = WorldZipcodes(self._engine) + return self._world_zipcodes + + @property + def pinterest_posts(self) -> PinterestPosts: + """Pinterest Posts dataset.""" + if self._pinterest_posts is None: + self._pinterest_posts = PinterestPosts(self._engine) + return self._pinterest_posts + + @property + def pinterest_profiles(self) -> PinterestProfiles: + """Pinterest Profiles dataset.""" + if self._pinterest_profiles is None: + self._pinterest_profiles = PinterestProfiles(self._engine) + return self._pinterest_profiles + + @property + def shopee_products(self) -> ShopeeProducts: + """Shopee Products dataset.""" + if self._shopee_products is None: + self._shopee_products = ShopeeProducts(self._engine) + return self._shopee_products + + @property + def lazada_products(self) -> LazadaProducts: + """Lazada Products dataset.""" + if self._lazada_products is None: + self._lazada_products = LazadaProducts(self._engine) + return self._lazada_products + + @property + def instagram_posts(self) -> InstagramPosts: + """Instagram Posts dataset.""" + if self._instagram_posts is None: + self._instagram_posts = InstagramPosts(self._engine) + return self._instagram_posts + + @property + def youtube_profiles(self) -> YouTubeProfiles: + """YouTube Profiles dataset.""" + if self._youtube_profiles is None: + self._youtube_profiles = YouTubeProfiles(self._engine) + return self._youtube_profiles + + @property + def youtube_videos(self) -> YouTubeVideos: + """YouTube Videos dataset.""" + if self._youtube_videos is None: + self._youtube_videos = YouTubeVideos(self._engine) + return self._youtube_videos + + @property + def youtube_comments(self) -> YouTubeComments: + """YouTube Comments dataset.""" + if self._youtube_comments is None: + self._youtube_comments = YouTubeComments(self._engine) + return self._youtube_comments + + @property + def digikey_products(self) -> DigikeyProducts: + """Digikey Products dataset.""" + if self._digikey_products is None: + self._digikey_products = DigikeyProducts(self._engine) + return self._digikey_products + + @property + def facebook_pages_posts(self) -> FacebookPagesPosts: + """Facebook Pages Posts dataset.""" + if self._facebook_pages_posts is None: + self._facebook_pages_posts = FacebookPagesPosts(self._engine) + return self._facebook_pages_posts diff --git a/src/brightdata/datasets/companies_enriched/__init__.py b/src/brightdata/datasets/companies_enriched/__init__.py new file mode 100644 index 0000000..3094c72 --- /dev/null +++ b/src/brightdata/datasets/companies_enriched/__init__.py @@ -0,0 +1,5 @@ +"""Companies Enriched dataset - multi-source company information.""" + +from .companies import CompaniesEnriched + +__all__ = ["CompaniesEnriched"] diff --git a/src/brightdata/datasets/companies_enriched/companies.py b/src/brightdata/datasets/companies_enriched/companies.py new file mode 100644 index 0000000..a775b27 --- /dev/null +++ b/src/brightdata/datasets/companies_enriched/companies.py @@ -0,0 +1,307 @@ +""" +Companies Enriched dataset. + +Multi-source company information combining data from: +- LinkedIn (_lc) +- Slintel/6sense (_sl) +- Owler (_ow) +- Crunchbase (_cb) +- Indeed (_in) +- ZoomInfo (_zi) +- Glassdoor (_gd) + +Use get_metadata() to discover all 336+ available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +# Data source suffixes +SOURCES = { + "_lc": "linkedin", + "_sl": "slintel", + "_ow": "owler", + "_cb": "crunchbase", + "_in": "indeed", + "_zi": "zoominfo", + "_gd": "glassdoor", +} + + +class CompaniesEnriched(BaseDataset): + """ + Companies Enriched dataset. + + Aggregates company information from 7+ data sources into a single dataset. + Each field is suffixed with its source (e.g., `name_lc` for LinkedIn, + `revenue_cb` for Crunchbase). + + Data Sources: + - LinkedIn (_lc): Company profiles, followers, employees + - Slintel (_sl): Tech stack, company news + - Owler (_ow): Revenue, funding, competitors + - Crunchbase (_cb): Funding rounds, investors, IPO status + - Indeed (_in): Job listings, reviews, salaries + - ZoomInfo (_zi): Contacts, org charts, tech stack + - Glassdoor (_gd): Ratings, reviews, salaries + + Example: + >>> companies = client.datasets.companies_enriched + >>> # Discover available fields + >>> metadata = await companies.get_metadata() + >>> print(f"Total fields: {len(metadata.fields)}") + >>> + >>> # Get fields by source + >>> linkedin_fields = await companies.get_fields_by_source("linkedin") + >>> crunchbase_fields = await companies.get_fields_by_source("crunchbase") + >>> + >>> # Filter companies + >>> snapshot_id = await companies( + ... filter={"name": "industries_lc", "operator": "=", "value": "Technology"}, + ... records_limit=100 + ... ) + >>> data = await companies.download(snapshot_id) + """ + + # TODO: Replace with actual dataset ID + DATASET_ID = "gd_lxxxxxxxxxxxxxx" # Get from Bright Data console + NAME = "companies_enriched" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_source: Optional[Dict[str, List[str]]] = None + + async def get_fields_by_source(self, source: str, include_inactive: bool = False) -> List[str]: + """ + Get field names from a specific data source. + + Args: + source: Source name - one of: linkedin, slintel, owler, + crunchbase, indeed, zoominfo, glassdoor + include_inactive: Include inactive fields (default: False) + + Returns: + List of field names from that source + + Example: + >>> linkedin_fields = await companies.get_fields_by_source("linkedin") + >>> # ['url_lc', 'name_lc', 'followers_lc', ...] + """ + metadata = await self.get_metadata() + suffix = self._get_suffix_for_source(source) + + if suffix is None: + raise ValueError( + f"Unknown source: {source}. " f"Valid sources: {list(SOURCES.values())}" + ) + + fields = [] + for name, field_info in metadata.fields.items(): + if name.endswith(suffix): + if include_inactive or field_info.active: + fields.append(name) + + return sorted(fields) + + async def get_all_sources(self) -> Dict[str, List[str]]: + """ + Get all fields grouped by data source. + + Returns: + Dict mapping source name to list of field names + + Example: + >>> sources = await companies.get_all_sources() + >>> for source, fields in sources.items(): + ... 
print(f"{source}: {len(fields)} fields") + """ + if self._fields_by_source is not None: + return self._fields_by_source + + metadata = await self.get_metadata() + result: Dict[str, List[str]] = {source: [] for source in SOURCES.values()} + result["other"] = [] # Fields without recognized suffix + + for name, field_info in metadata.fields.items(): + if not field_info.active: + continue + + source_found = False + for suffix, source_name in SOURCES.items(): + if name.endswith(suffix): + result[source_name].append(name) + source_found = True + break + + if not source_found: + result["other"].append(name) + + # Sort each list + for source in result: + result[source] = sorted(result[source]) + + self._fields_by_source = result + return result + + async def get_common_fields(self) -> Dict[str, Dict[str, str]]: + """ + Get common field types across sources. + + Returns: + Dict mapping concept to source fields. + E.g., {"name": {"linkedin": "name_lc", "crunchbase": "name_cb"}} + + Example: + >>> common = await companies.get_common_fields() + >>> # Get name from all sources that have it + >>> name_fields = common.get("name", {}) + """ + # Common field patterns across sources + common_patterns = { + "name": [ + "name_lc", + "name_sl", + "companyName_ow", + "name_cb", + "name_in", + "name_zi", + "company_gd", + ], + "url": ["url_lc", "url_sl", "url_ow", "url_cb", "url_in", "url_zi", "url_gd"], + "website": [ + "website_lc", + "website_sl", + "website_ow", + "website_cb", + "website_in", + "website_zi", + "details_website_gd", + ], + "industry": [ + "industries_lc", + "industries_sl", + "industries_ow", + "industries_cb", + "industry_in", + "industry_zi", + "industry_gd", + ], + "employees": [ + "employees_lc", + "num_employees_sl", + "employeeCount_ow", + "num_employees_cb", + "company_size_in", + "employees_zi", + "details_size_gd", + ], + "revenue": ["revenue_ow", "revenue_zi", "revenue_in", "details_revenue_gd"], + "founded": ["founded_lc", "founded_ow", "founded_date_cb", "details_founded_gd"], + "country": [ + "country_code_lc", + "country_code_sl", + "country_ow", + "country_code_cb", + "country_code_in", + "country_code_gd", + ], + "description": [ + "about_lc", + "about_sl", + "description_ow", + "about_cb", + "description_in", + "description_zi", + ], + "logo": ["logo_lc", "logo_sl", "image_cb", "logo_in", "logo_gd"], + "headquarters": [ + "headquarters_lc", + "location_sl", + "city_ow", + "location_cb", + "headquarters_in", + "headquarters_zi", + "details_headquarters_gd", + ], + } + + metadata = await self.get_metadata() + available_fields = set(metadata.fields.keys()) + + result: Dict[str, Dict[str, str]] = {} + for concept, field_names in common_patterns.items(): + result[concept] = {} + for field_name in field_names: + if field_name in available_fields: + # Extract source from suffix + for suffix, source_name in SOURCES.items(): + if field_name.endswith(suffix): + result[concept][source_name] = field_name + break + + return result + + async def search_fields(self, keyword: str) -> List[str]: + """ + Search for fields containing a keyword. + + Args: + keyword: Keyword to search for (case-insensitive) + + Returns: + List of matching field names + + Example: + >>> funding_fields = await companies.search_fields("funding") + >>> # ['funding_lc', 'totalFunding_ow', 'funding_rounds_cb', ...] 
+ """ + metadata = await self.get_metadata() + keyword_lower = keyword.lower() + + matches = [] + for name, field_info in metadata.fields.items(): + if keyword_lower in name.lower(): + matches.append(name) + elif field_info.description and keyword_lower in field_info.description.lower(): + matches.append(name) + + return sorted(matches) + + def _get_suffix_for_source(self, source: str) -> Optional[str]: + """Get the field suffix for a source name.""" + source_lower = source.lower() + for suffix, name in SOURCES.items(): + if name == source_lower: + return suffix + return None + + @staticmethod + def get_source_for_field(field_name: str) -> Optional[str]: + """ + Get the data source for a field name. + + Args: + field_name: Field name (e.g., "name_lc") + + Returns: + Source name (e.g., "linkedin") or None if not recognized + """ + for suffix, source_name in SOURCES.items(): + if field_name.endswith(suffix): + return source_name + return None + + @classmethod + def list_sources(cls) -> List[str]: + """ + List all available data sources. + + Returns: + List of source names + """ + return list(SOURCES.values()) diff --git a/src/brightdata/datasets/crateandbarrel/__init__.py b/src/brightdata/datasets/crateandbarrel/__init__.py new file mode 100644 index 0000000..54bf0e1 --- /dev/null +++ b/src/brightdata/datasets/crateandbarrel/__init__.py @@ -0,0 +1,5 @@ +"""Crate and Barrel datasets.""" + +from .products import CrateAndBarrelProducts + +__all__ = ["CrateAndBarrelProducts"] diff --git a/src/brightdata/datasets/crateandbarrel/products.py b/src/brightdata/datasets/crateandbarrel/products.py new file mode 100644 index 0000000..4f5ce3a --- /dev/null +++ b/src/brightdata/datasets/crateandbarrel/products.py @@ -0,0 +1,25 @@ +""" +Crate and Barrel Products dataset. + +Home furnishing and decor product listings from Crate and Barrel. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class CrateAndBarrelProducts(BaseDataset): + """Crate and Barrel Products dataset.""" + + DATASET_ID = "gd_lemtcp2p2qdyd24vq5" + NAME = "crateandbarrel_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/delvaux/__init__.py b/src/brightdata/datasets/delvaux/__init__.py new file mode 100644 index 0000000..f3d0001 --- /dev/null +++ b/src/brightdata/datasets/delvaux/__init__.py @@ -0,0 +1,5 @@ +"""Delvaux datasets.""" + +from .products import DelvauxProducts + +__all__ = ["DelvauxProducts"] diff --git a/src/brightdata/datasets/delvaux/products.py b/src/brightdata/datasets/delvaux/products.py new file mode 100644 index 0000000..8a56114 --- /dev/null +++ b/src/brightdata/datasets/delvaux/products.py @@ -0,0 +1,25 @@ +""" +Delvaux Products dataset. + +Luxury product listings from Delvaux. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class DelvauxProducts(BaseDataset): + """Delvaux Products dataset.""" + + DATASET_ID = "gd_lhahvbli142qv9r0v1" + NAME = "delvaux_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/digikey/__init__.py b/src/brightdata/datasets/digikey/__init__.py new file mode 100644 index 0000000..d659e65 --- /dev/null +++ b/src/brightdata/datasets/digikey/__init__.py @@ -0,0 +1,5 @@ +"""Digikey datasets.""" + +from .products import DigikeyProducts + +__all__ = ["DigikeyProducts"] diff --git a/src/brightdata/datasets/digikey/products.py b/src/brightdata/datasets/digikey/products.py new file mode 100644 index 0000000..499dfc3 --- /dev/null +++ b/src/brightdata/datasets/digikey/products.py @@ -0,0 +1,25 @@ +""" +Digikey Products dataset. + +Electronic components from Digikey. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class DigikeyProducts(BaseDataset): + """Digikey Products dataset.""" + + DATASET_ID = "gd_lj74waf72416ro0k65" + NAME = "digikey_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/dior/__init__.py b/src/brightdata/datasets/dior/__init__.py new file mode 100644 index 0000000..d0fc6cc --- /dev/null +++ b/src/brightdata/datasets/dior/__init__.py @@ -0,0 +1,5 @@ +"""Dior datasets.""" + +from .products import DiorProducts + +__all__ = ["DiorProducts"] diff --git a/src/brightdata/datasets/dior/products.py b/src/brightdata/datasets/dior/products.py new file mode 100644 index 0000000..ab05600 --- /dev/null +++ b/src/brightdata/datasets/dior/products.py @@ -0,0 +1,25 @@ +""" +Dior Products dataset. + +Luxury product listings from Dior. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class DiorProducts(BaseDataset): + """Dior Products dataset.""" + + DATASET_ID = "gd_lh7o3kqu6wp7qmqkl" + NAME = "dior_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/employees_enriched/__init__.py b/src/brightdata/datasets/employees_enriched/__init__.py new file mode 100644 index 0000000..cfe86da --- /dev/null +++ b/src/brightdata/datasets/employees_enriched/__init__.py @@ -0,0 +1,5 @@ +"""Employees Business Enriched dataset - LinkedIn profiles with company data.""" + +from .employees import EmployeesEnriched + +__all__ = ["EmployeesEnriched"] diff --git a/src/brightdata/datasets/employees_enriched/employees.py b/src/brightdata/datasets/employees_enriched/employees.py new file mode 100644 index 0000000..c10480a --- /dev/null +++ b/src/brightdata/datasets/employees_enriched/employees.py @@ -0,0 +1,250 @@ +""" +Employees Business Enriched dataset. + +LinkedIn employee profiles enriched with company information. 
+Contains profile data (education, experience, certifications) alongside +associated company details (revenue, funding, size). + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +# Field categories for organization +PROFILE_FIELDS = [ + "url", + "profile_url", + "linkedin_num_id", + "avatar", + "profile_name", + "certifications", + "profile_location", + "profile_connections", + "profile_country_code", + "profile_education_full", + "profile_last_education", + "profile_experience_full", + "profile_last_experience", + "profile_followers", + "profile_linkedin_id", + "profile_current_position", + "profile_current_title", + "profile_activity", + "profile_posts", + "profile_about", + "profile_courses", + "profile_volunteer_experience", + "profile_languages", + "profile_publications", + "profile_recommendations", + "profile_recommendations_count", + "profile_organizations", + "profile_projects", + "profile_bio_links", +] + +COMPANY_FIELDS = [ + "company_name", + "company_id", + "company_linkedin_url", + "company_size", + "company_country_code", + "company_description", + "company_other_employees", + "employees_in_linkedin", + "company_linkedin_followers", + "company_locations", + "company_founded_year", + "company_headquarters", + "company_categories", + "company_logo", + "company_slogan", + "company_specialties", + "company_updates", + "company_website", + "company_type", + "company_clean_domain", + "company_revenue_usd", + "company_total_funding", + "company_total_employees", + "company_stock_symbol", + "company_is_non_profit", + "company_parent_company", +] + + +class EmployeesEnriched(BaseDataset): + """ + Employees Business Enriched dataset. + + LinkedIn employee profiles enriched with their associated company + information. Each record contains detailed profile data alongside + company metrics. + + Field Categories: + - Profile: Personal info, education, experience, certifications + - Company: Associated company details, revenue, funding, size + + Example: + >>> employees = client.datasets.employees_enriched + >>> # Discover available fields + >>> metadata = await employees.get_metadata() + >>> print(f"Total fields: {len(metadata.fields)}") + >>> + >>> # Get fields by category + >>> profile_fields = employees.get_profile_fields() + >>> company_fields = employees.get_company_fields() + >>> + >>> # Filter employees + >>> snapshot_id = await employees( + ... filter={"name": "profile_country_code", "operator": "=", "value": "US"}, + ... records_limit=100 + ... ) + >>> data = await employees.download(snapshot_id) + """ + + # TODO: Replace with actual dataset ID + DATASET_ID = "gd_lxxxxxxxxxxxxxx" # Get from Bright Data console + NAME = "employees_enriched" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None + + @staticmethod + def get_profile_fields() -> List[str]: + """ + Get all profile-related field names. + + Returns: + List of profile field names + + Example: + >>> profile_fields = employees.get_profile_fields() + >>> # ['url', 'profile_url', 'profile_name', ...] + """ + return PROFILE_FIELDS.copy() + + @staticmethod + def get_company_fields() -> List[str]: + """ + Get all company-related field names. 
+ + Returns: + List of company field names + + Example: + >>> company_fields = employees.get_company_fields() + >>> # ['company_name', 'company_id', 'company_revenue_usd', ...] + """ + return COMPANY_FIELDS.copy() + + async def get_fields_by_category(self) -> Dict[str, List[str]]: + """ + Get all fields grouped by category. + + Returns: + Dict mapping category name to list of field names + + Example: + >>> categories = await employees.get_fields_by_category() + >>> for category, fields in categories.items(): + ... print(f"{category}: {len(fields)} fields") + """ + if self._fields_by_category is not None: + return self._fields_by_category + + metadata = await self.get_metadata() + result: Dict[str, List[str]] = { + "profile": [], + "company": [], + "other": [], + } + + profile_set = set(PROFILE_FIELDS) + company_set = set(COMPANY_FIELDS) + + for name, field_info in metadata.fields.items(): + if not field_info.active: + continue + + if name in profile_set or name.startswith("profile_"): + result["profile"].append(name) + elif name in company_set or name.startswith("company_"): + result["company"].append(name) + else: + result["other"].append(name) + + # Sort each list + for category in result: + result[category] = sorted(result[category]) + + self._fields_by_category = result + return result + + async def search_fields(self, keyword: str) -> List[str]: + """ + Search for fields containing a keyword. + + Args: + keyword: Keyword to search for (case-insensitive) + + Returns: + List of matching field names + + Example: + >>> education_fields = await employees.search_fields("education") + >>> # ['profile_education_full', 'profile_last_education'] + """ + metadata = await self.get_metadata() + keyword_lower = keyword.lower() + + matches = [] + for name, field_info in metadata.fields.items(): + if keyword_lower in name.lower(): + matches.append(name) + elif field_info.description and keyword_lower in field_info.description.lower(): + matches.append(name) + + return sorted(matches) + + async def get_experience_fields(self) -> List[str]: + """ + Get fields related to work experience. + + Returns: + List of experience-related field names + """ + return await self.search_fields("experience") + + async def get_education_fields(self) -> List[str]: + """ + Get fields related to education. + + Returns: + List of education-related field names + """ + return await self.search_fields("education") + + @staticmethod + def get_identifier_fields() -> List[str]: + """ + Get fields that can be used as unique identifiers. + + Returns: + List of identifier field names + """ + return [ + "url", + "profile_url", + "linkedin_num_id", + "profile_linkedin_id", + "company_id", + "company_linkedin_url", + ] diff --git a/src/brightdata/datasets/facebook/__init__.py b/src/brightdata/datasets/facebook/__init__.py new file mode 100644 index 0000000..0d943ef --- /dev/null +++ b/src/brightdata/datasets/facebook/__init__.py @@ -0,0 +1,5 @@ +"""Facebook datasets.""" + +from .pages_posts import FacebookPagesPosts + +__all__ = ["FacebookPagesPosts"] diff --git a/src/brightdata/datasets/facebook/pages_posts.py b/src/brightdata/datasets/facebook/pages_posts.py new file mode 100644 index 0000000..bccc3b6 --- /dev/null +++ b/src/brightdata/datasets/facebook/pages_posts.py @@ -0,0 +1,25 @@ +""" +Facebook Pages Posts dataset. + +Posts from Facebook Pages by profile URL. + +Use get_metadata() to discover all available fields dynamically. 
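+
+Example (illustrative sketch; "page_name" is an assumed field name and
+should be checked against get_metadata() before use):
+
+    >>> posts = client.datasets.facebook_pages_posts
+    >>> snapshot_id = await posts(
+    ...     filter={"name": "page_name", "operator": "=", "value": "Bright Data"},
+    ...     records_limit=50
+    ... )
+    >>> data = await posts.download(snapshot_id)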
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class FacebookPagesPosts(BaseDataset): + """Facebook Pages Posts dataset.""" + + DATASET_ID = "gd_lkaxegm826bjpoo9m5" + NAME = "facebook_pages_posts" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/fanatics/__init__.py b/src/brightdata/datasets/fanatics/__init__.py new file mode 100644 index 0000000..823e911 --- /dev/null +++ b/src/brightdata/datasets/fanatics/__init__.py @@ -0,0 +1,5 @@ +"""Fanatics datasets.""" + +from .products import FanaticsProducts + +__all__ = ["FanaticsProducts"] diff --git a/src/brightdata/datasets/fanatics/products.py b/src/brightdata/datasets/fanatics/products.py new file mode 100644 index 0000000..28faff2 --- /dev/null +++ b/src/brightdata/datasets/fanatics/products.py @@ -0,0 +1,25 @@ +""" +Fanatics Products dataset. + +Sports merchandise and product listings from Fanatics. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class FanaticsProducts(BaseDataset): + """Fanatics Products dataset.""" + + DATASET_ID = "gd_le124kuq1uoj7zj8hb" + NAME = "fanatics_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/fendi/__init__.py b/src/brightdata/datasets/fendi/__init__.py new file mode 100644 index 0000000..21ce86f --- /dev/null +++ b/src/brightdata/datasets/fendi/__init__.py @@ -0,0 +1,5 @@ +"""Fendi datasets.""" + +from .products import FendiProducts + +__all__ = ["FendiProducts"] diff --git a/src/brightdata/datasets/fendi/products.py b/src/brightdata/datasets/fendi/products.py new file mode 100644 index 0000000..d8238cd --- /dev/null +++ b/src/brightdata/datasets/fendi/products.py @@ -0,0 +1,25 @@ +""" +Fendi Products dataset. + +Luxury product listings from Fendi. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class FendiProducts(BaseDataset): + """Fendi Products dataset.""" + + DATASET_ID = "gd_lbqsfpfk71ubir3pi" + NAME = "fendi_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/g2/__init__.py b/src/brightdata/datasets/g2/__init__.py new file mode 100644 index 0000000..45a3121 --- /dev/null +++ b/src/brightdata/datasets/g2/__init__.py @@ -0,0 +1,6 @@ +"""G2 datasets.""" + +from .products import G2Products +from .reviews import G2Reviews + +__all__ = ["G2Products", "G2Reviews"] diff --git a/src/brightdata/datasets/g2/products.py b/src/brightdata/datasets/g2/products.py new file mode 100644 index 0000000..e3ca936 --- /dev/null +++ b/src/brightdata/datasets/g2/products.py @@ -0,0 +1,216 @@ +""" +G2 Software Product Overview dataset. + +Software product listings from G2 with ratings, reviews, pricing, +and competitive analysis. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +# Field categories +PRODUCT_FIELDS = [ + "url", + "product_name", + "product_id", + "product_url", + "software_product_id", + "numerical_id", + "logo", + "description", + "overview", + "what_is_description", +] + +SELLER_FIELDS = [ + "seller", + "ownership", + "seller_website", + "headquarters", + "seller_description", + "year_founded", + "overview_provided_by", +] + +RATING_FIELDS = [ + "rating", + "reviews_count", + "rating_split", + "pros_list", + "cons_list", + "reviews", + "highest_rated_features", +] + +COMPETITIVE_FIELDS = [ + "competitors", + "position_against_competitors", + "top_alternatives", + "top_alternatives_url", +] + +CONTENT_FIELDS = [ + "pricing", + "full_pricing_page", + "official_screenshots", + "official_downloads", + "official_videos", + "Features", +] + +CATEGORY_FIELDS = [ + "categories", + "main_category", + "main_subject", + "languages_supported", + "badge", + "claimed", + "region", + "country_code", +] + + +class G2Products(BaseDataset): + """ + G2 Software Product Overview dataset. + + Software product listings with ratings, reviews, pricing, + features, and competitive positioning from G2. + + Field Categories: + - Product: Name, ID, description, overview + - Seller: Company info, website, headquarters + - Rating: Scores, reviews, pros/cons + - Competitive: Competitors, alternatives, positioning + - Content: Pricing, screenshots, videos, features + - Category: Categories, languages, region + + Example: + >>> g2 = client.datasets.g2_products + >>> # Discover available fields + >>> metadata = await g2.get_metadata() + >>> print(f"Total fields: {len(metadata.fields)}") + >>> + >>> # Filter by rating + >>> snapshot_id = await g2( + ... filter={"name": "rating", "operator": ">=", "value": "4.5"}, + ... records_limit=100 + ... 
) + >>> data = await g2.download(snapshot_id) + """ + + DATASET_ID = "gd_l88xp4k01qnhvyqlvw" + NAME = "g2_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None + + @staticmethod + def get_product_fields() -> List[str]: + """Get product identity field names.""" + return PRODUCT_FIELDS.copy() + + @staticmethod + def get_seller_fields() -> List[str]: + """Get seller/vendor field names.""" + return SELLER_FIELDS.copy() + + @staticmethod + def get_rating_fields() -> List[str]: + """Get rating and review field names.""" + return RATING_FIELDS.copy() + + @staticmethod + def get_competitive_fields() -> List[str]: + """Get competitive analysis field names.""" + return COMPETITIVE_FIELDS.copy() + + @staticmethod + def get_content_fields() -> List[str]: + """Get content field names (pricing, media).""" + return CONTENT_FIELDS.copy() + + @staticmethod + def get_category_fields() -> List[str]: + """Get category and classification field names.""" + return CATEGORY_FIELDS.copy() + + async def get_fields_by_category(self) -> Dict[str, List[str]]: + """Get all fields grouped by category.""" + if self._fields_by_category is not None: + return self._fields_by_category + + metadata = await self.get_metadata() + result: Dict[str, List[str]] = { + "product": [], + "seller": [], + "rating": [], + "competitive": [], + "content": [], + "category": [], + "other": [], + } + + product_set = set(PRODUCT_FIELDS) + seller_set = set(SELLER_FIELDS) + rating_set = set(RATING_FIELDS) + competitive_set = set(COMPETITIVE_FIELDS) + content_set = set(CONTENT_FIELDS) + category_set = set(CATEGORY_FIELDS) + + for name, field_info in metadata.fields.items(): + if not field_info.active: + continue + + if name in product_set or name.startswith("product_"): + result["product"].append(name) + elif name in seller_set or name.startswith("seller"): + result["seller"].append(name) + elif name in rating_set or "rating" in name or "review" in name: + result["rating"].append(name) + elif name in competitive_set or "competitor" in name or "alternative" in name: + result["competitive"].append(name) + elif name in content_set or "pricing" in name: + result["content"].append(name) + elif name in category_set or "category" in name: + result["category"].append(name) + else: + result["other"].append(name) + + for category in result: + result[category] = sorted(result[category]) + + self._fields_by_category = result + return result + + async def search_fields(self, keyword: str) -> List[str]: + """Search for fields containing a keyword.""" + metadata = await self.get_metadata() + keyword_lower = keyword.lower() + + matches = [] + for name, field_info in metadata.fields.items(): + if keyword_lower in name.lower(): + matches.append(name) + elif field_info.description and keyword_lower in field_info.description.lower(): + matches.append(name) + + return sorted(matches) + + @staticmethod + def get_identifier_fields() -> List[str]: + """Get fields that can be used as unique identifiers.""" + return [ + "product_id", + "software_product_id", + "numerical_id", + "url", + ] diff --git a/src/brightdata/datasets/g2/reviews.py b/src/brightdata/datasets/g2/reviews.py new file mode 100644 index 0000000..de8456a --- /dev/null +++ b/src/brightdata/datasets/g2/reviews.py @@ -0,0 +1,151 @@ +""" +G2 Software Product Reviews dataset. + +Individual product reviews from G2 with author details and ratings. + +Use get_metadata() to discover all available fields dynamically. 
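+
+Example (illustrative sketch; both helpers are defined in this module
+and return the static field lists shown):
+
+    >>> G2Reviews.get_review_fields()
+    ['review_id', 'date', 'title', 'text', 'tags', 'stars', 'review_url']
+    >>> G2Reviews.get_identifier_fields()
+    ['review_id', 'author_id', 'review_url']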
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +# Field categories +REVIEW_FIELDS = [ + "review_id", + "date", + "title", + "text", + "tags", + "stars", + "review_url", +] + +AUTHOR_FIELDS = [ + "author_id", + "author", + "position", + "company_size", +] + +PRODUCT_FIELDS = [ + "url", + "product_url", + "page", + "pages", + "product_name", + "vendor_name", + "sort_filter", +] + + +class G2Reviews(BaseDataset): + """ + G2 Software Product Reviews dataset. + + Individual product reviews with author information, ratings, + and detailed review content. + + Field Categories: + - Review: ID, date, title, text, stars, tags + - Author: ID, name, position, company size + - Product: Name, vendor, URL + + Example: + >>> reviews = client.datasets.g2_reviews + >>> # Discover available fields + >>> metadata = await reviews.get_metadata() + >>> print(f"Total fields: {len(metadata.fields)}") + >>> + >>> # Filter by rating + >>> snapshot_id = await reviews( + ... filter={"name": "stars", "operator": ">=", "value": "4"}, + ... records_limit=100 + ... ) + >>> data = await reviews.download(snapshot_id) + """ + + DATASET_ID = "gd_l88xvdka1uao86xvlb" + NAME = "g2_reviews" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None + + @staticmethod + def get_review_fields() -> List[str]: + """Get review content field names.""" + return REVIEW_FIELDS.copy() + + @staticmethod + def get_author_fields() -> List[str]: + """Get author-related field names.""" + return AUTHOR_FIELDS.copy() + + @staticmethod + def get_product_fields() -> List[str]: + """Get product-related field names.""" + return PRODUCT_FIELDS.copy() + + async def get_fields_by_category(self) -> Dict[str, List[str]]: + """Get all fields grouped by category.""" + if self._fields_by_category is not None: + return self._fields_by_category + + metadata = await self.get_metadata() + result: Dict[str, List[str]] = { + "review": [], + "author": [], + "product": [], + "other": [], + } + + review_set = set(REVIEW_FIELDS) + author_set = set(AUTHOR_FIELDS) + product_set = set(PRODUCT_FIELDS) + + for name, field_info in metadata.fields.items(): + if not field_info.active: + continue + + if name in review_set or name.startswith("review"): + result["review"].append(name) + elif name in author_set or name.startswith("author"): + result["author"].append(name) + elif name in product_set or name.startswith("product"): + result["product"].append(name) + else: + result["other"].append(name) + + for category in result: + result[category] = sorted(result[category]) + + self._fields_by_category = result + return result + + async def search_fields(self, keyword: str) -> List[str]: + """Search for fields containing a keyword.""" + metadata = await self.get_metadata() + keyword_lower = keyword.lower() + + matches = [] + for name, field_info in metadata.fields.items(): + if keyword_lower in name.lower(): + matches.append(name) + elif field_info.description and keyword_lower in field_info.description.lower(): + matches.append(name) + + return sorted(matches) + + @staticmethod + def get_identifier_fields() -> List[str]: + """Get fields that can be used as unique identifiers.""" + return [ + "review_id", + "author_id", + "review_url", + ] diff --git a/src/brightdata/datasets/glassdoor/__init__.py b/src/brightdata/datasets/glassdoor/__init__.py new file mode 100644 index 
0000000..aadf858 --- /dev/null +++ b/src/brightdata/datasets/glassdoor/__init__.py @@ -0,0 +1,7 @@ +"""Glassdoor datasets.""" + +from .companies import GlassdoorCompanies +from .reviews import GlassdoorReviews +from .jobs import GlassdoorJobs + +__all__ = ["GlassdoorCompanies", "GlassdoorReviews", "GlassdoorJobs"] diff --git a/src/brightdata/datasets/glassdoor/companies.py b/src/brightdata/datasets/glassdoor/companies.py new file mode 100644 index 0000000..25435cf --- /dev/null +++ b/src/brightdata/datasets/glassdoor/companies.py @@ -0,0 +1,301 @@ +""" +Glassdoor Companies Overview dataset. + +Company information from Glassdoor including ratings, reviews, +salary data, and interview insights. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +# Field categories for organization +DETAILS_FIELDS = [ + "id", + "company", + "country_code", + "region", + "company_type", + "industry", + "details_size", + "details_founded", + "details_type", + "details_headquarters", + "details_industry", + "details_revenue", + "details_website", + "stock_symbol", + "competitors", + "additional_information", +] + +RATINGS_FIELDS = [ + "ratings_overall", + "ratings_career_opportunities", + "ratings_ceo_approval", + "ratings_ceo_approval_count", + "ratings_compensation_benefits", + "ratings_culture_values", + "ratings_senior_management", + "ratings_work_life_balance", + "ratings_business_outlook", + "ratings_recommend_to_friend", + "ratings_rated_ceo", + "diversity_inclusion_score", + "diversity_inclusion_count", + "career_opportunities_distribution", +] + +URL_FIELDS = [ + "url", + "url_overview", + "url_jobs", + "url_reviews", + "url_faq", + "benefits_url", + "salaries_url", + "interviews_url", + "photos_url", +] + +COUNT_FIELDS = [ + "salaries_count", + "interviews_count", + "benefits_count", + "jobs_count", + "photos_count", + "reviews_count", +] + +INTERVIEW_FIELDS = [ + "interview_difficulty", + "interviews_count", + "interviews_experience", + "interviews_url", +] + + +class GlassdoorCompanies(BaseDataset): + """ + Glassdoor Companies Overview dataset. + + Company information from Glassdoor including employee ratings, + CEO approval, salary insights, and interview experiences. + + Field Categories: + - Details: Company info, size, industry, headquarters + - Ratings: Overall rating, work-life balance, culture, compensation + - URLs: Links to company pages on Glassdoor + - Counts: Number of reviews, salaries, interviews, etc. + - Interviews: Interview difficulty and experience data + + Example: + >>> glassdoor = client.datasets.glassdoor_companies + >>> # Discover available fields + >>> metadata = await glassdoor.get_metadata() + >>> print(f"Total fields: {len(metadata.fields)}") + >>> + >>> # Get fields by category + >>> ratings = glassdoor.get_ratings_fields() + >>> details = glassdoor.get_details_fields() + >>> + >>> # Filter companies by rating + >>> snapshot_id = await glassdoor( + ... filter={"name": "ratings_overall", "operator": ">=", "value": "4.0"}, + ... records_limit=100 + ... 
) + >>> data = await glassdoor.download(snapshot_id) + """ + + DATASET_ID = "gd_l7j0bx501ockwldaqf" + NAME = "glassdoor_companies" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None + + @staticmethod + def get_details_fields() -> List[str]: + """ + Get company details field names. + + Returns: + List of details field names + + Example: + >>> details = glassdoor.get_details_fields() + >>> # ['id', 'company', 'details_size', 'details_industry', ...] + """ + return DETAILS_FIELDS.copy() + + @staticmethod + def get_ratings_fields() -> List[str]: + """ + Get all ratings-related field names. + + Returns: + List of ratings field names + + Example: + >>> ratings = glassdoor.get_ratings_fields() + >>> # ['ratings_overall', 'ratings_work_life_balance', ...] + """ + return RATINGS_FIELDS.copy() + + @staticmethod + def get_url_fields() -> List[str]: + """ + Get all URL field names. + + Returns: + List of URL field names + + Example: + >>> urls = glassdoor.get_url_fields() + >>> # ['url', 'url_overview', 'url_jobs', ...] + """ + return URL_FIELDS.copy() + + @staticmethod + def get_count_fields() -> List[str]: + """ + Get all count-related field names. + + Returns: + List of count field names + + Example: + >>> counts = glassdoor.get_count_fields() + >>> # ['reviews_count', 'salaries_count', ...] + """ + return COUNT_FIELDS.copy() + + @staticmethod + def get_interview_fields() -> List[str]: + """ + Get interview-related field names. + + Returns: + List of interview field names + + Example: + >>> interviews = glassdoor.get_interview_fields() + >>> # ['interview_difficulty', 'interviews_experience', ...] + """ + return INTERVIEW_FIELDS.copy() + + async def get_fields_by_category(self) -> Dict[str, List[str]]: + """ + Get all fields grouped by category. + + Returns: + Dict mapping category name to list of field names + + Example: + >>> categories = await glassdoor.get_fields_by_category() + >>> for category, fields in categories.items(): + ... print(f"{category}: {len(fields)} fields") + """ + if self._fields_by_category is not None: + return self._fields_by_category + + metadata = await self.get_metadata() + result: Dict[str, List[str]] = { + "details": [], + "ratings": [], + "urls": [], + "counts": [], + "other": [], + } + + details_set = set(DETAILS_FIELDS) + ratings_set = set(RATINGS_FIELDS) + url_set = set(URL_FIELDS) + count_set = set(COUNT_FIELDS) + + for name, field_info in metadata.fields.items(): + if not field_info.active: + continue + + if name in details_set or name.startswith("details_"): + result["details"].append(name) + elif name in ratings_set or name.startswith("ratings_"): + result["ratings"].append(name) + elif name in url_set or name.endswith("_url"): + result["urls"].append(name) + elif name in count_set or name.endswith("_count"): + result["counts"].append(name) + else: + result["other"].append(name) + + # Sort each list + for category in result: + result[category] = sorted(result[category]) + + self._fields_by_category = result + return result + + async def search_fields(self, keyword: str) -> List[str]: + """ + Search for fields containing a keyword. 
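+
+        Matches against both field names and field descriptions,
+        case-insensitively.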
+ + Args: + keyword: Keyword to search for (case-insensitive) + + Returns: + List of matching field names + + Example: + >>> salary_fields = await glassdoor.search_fields("salary") + >>> # ['salaries_url', 'salaries_count'] + """ + metadata = await self.get_metadata() + keyword_lower = keyword.lower() + + matches = [] + for name, field_info in metadata.fields.items(): + if keyword_lower in name.lower(): + matches.append(name) + elif field_info.description and keyword_lower in field_info.description.lower(): + matches.append(name) + + return sorted(matches) + + async def get_diversity_fields(self) -> List[str]: + """ + Get diversity and inclusion related fields. + + Returns: + List of diversity-related field names + """ + return await self.search_fields("diversity") + + async def get_ceo_fields(self) -> List[str]: + """ + Get CEO-related fields. + + Returns: + List of CEO-related field names + """ + return await self.search_fields("ceo") + + @staticmethod + def get_identifier_fields() -> List[str]: + """ + Get fields that can be used as unique identifiers. + + Returns: + List of identifier field names + """ + return [ + "id", + "url", + "url_overview", + "details_website", + ] diff --git a/src/brightdata/datasets/glassdoor/jobs.py b/src/brightdata/datasets/glassdoor/jobs.py new file mode 100644 index 0000000..4270103 --- /dev/null +++ b/src/brightdata/datasets/glassdoor/jobs.py @@ -0,0 +1,179 @@ +""" +Glassdoor Job Listings dataset. + +Job postings from Glassdoor with company info, ratings, and pay data. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +# Field categories +JOB_FIELDS = [ + "url", + "job_title", + "job_location", + "job_overview", + "job_posting_id", + "job_application_link", +] + +COMPANY_FIELDS = [ + "company_url_overview", + "company_name", + "company_rating", + "company_id", + "company_headquarters", + "company_founded_year", + "company_industry", + "company_revenue", + "company_size", + "company_type", + "company_sector", + "company_website", + "company_ceo", +] + +RATING_FIELDS = [ + "company_career_opportunities_rating", + "company_comp_and_benefits_rating", + "company_culture_and_values_rating", + "company_senior_management_rating", + "company_work/life_balance_rating", + "company_benefits_rating", + "percentage_that_recommend_company_to_a friend", + "percentage_that_approve_of_ceo", +] + +PAY_FIELDS = [ + "pay_range_glassdoor_est", + "pay_median_glassdoor", + "pay_range_employer_est", + "pay_median_employer", + "pay_range_currency", + "pay_type", +] + + +class GlassdoorJobs(BaseDataset): + """ + Glassdoor Job Listings dataset. + + Job postings with detailed company information, ratings, + salary estimates, and employee reviews. + + Field Categories: + - Job: Title, location, overview, application link + - Company: Name, industry, size, headquarters + - Ratings: Career opportunities, culture, management, etc. + - Pay: Salary ranges from Glassdoor and employer estimates + + Example: + >>> jobs = client.datasets.glassdoor_jobs + >>> # Discover available fields + >>> metadata = await jobs.get_metadata() + >>> print(f"Total fields: {len(metadata.fields)}") + >>> + >>> # Filter by job title + >>> snapshot_id = await jobs( + ... filter={"name": "job_title", "operator": "contains", "value": "Engineer"}, + ... records_limit=100 + ... 
) + >>> data = await jobs.download(snapshot_id) + """ + + DATASET_ID = "gd_lpfbbndm1xnopbrcr0" + NAME = "glassdoor_jobs" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None + + @staticmethod + def get_job_fields() -> List[str]: + """Get job-related field names.""" + return JOB_FIELDS.copy() + + @staticmethod + def get_company_fields() -> List[str]: + """Get company-related field names.""" + return COMPANY_FIELDS.copy() + + @staticmethod + def get_rating_fields() -> List[str]: + """Get rating field names.""" + return RATING_FIELDS.copy() + + @staticmethod + def get_pay_fields() -> List[str]: + """Get pay-related field names.""" + return PAY_FIELDS.copy() + + async def get_fields_by_category(self) -> Dict[str, List[str]]: + """Get all fields grouped by category.""" + if self._fields_by_category is not None: + return self._fields_by_category + + metadata = await self.get_metadata() + result: Dict[str, List[str]] = { + "job": [], + "company": [], + "rating": [], + "pay": [], + "other": [], + } + + job_set = set(JOB_FIELDS) + company_set = set(COMPANY_FIELDS) + rating_set = set(RATING_FIELDS) + pay_set = set(PAY_FIELDS) + + for name, field_info in metadata.fields.items(): + if not field_info.active: + continue + + if name in job_set or name.startswith("job_"): + result["job"].append(name) + elif name in company_set or name.startswith("company_"): + result["company"].append(name) + elif name in rating_set or "rating" in name: + result["rating"].append(name) + elif name in pay_set or name.startswith("pay_"): + result["pay"].append(name) + else: + result["other"].append(name) + + for category in result: + result[category] = sorted(result[category]) + + self._fields_by_category = result + return result + + async def search_fields(self, keyword: str) -> List[str]: + """Search for fields containing a keyword.""" + metadata = await self.get_metadata() + keyword_lower = keyword.lower() + + matches = [] + for name, field_info in metadata.fields.items(): + if keyword_lower in name.lower(): + matches.append(name) + elif field_info.description and keyword_lower in field_info.description.lower(): + matches.append(name) + + return sorted(matches) + + @staticmethod + def get_identifier_fields() -> List[str]: + """Get fields that can be used as unique identifiers.""" + return [ + "job_posting_id", + "url", + "company_id", + ] diff --git a/src/brightdata/datasets/glassdoor/reviews.py b/src/brightdata/datasets/glassdoor/reviews.py new file mode 100644 index 0000000..1d4d613 --- /dev/null +++ b/src/brightdata/datasets/glassdoor/reviews.py @@ -0,0 +1,198 @@ +""" +Glassdoor Companies Reviews dataset. + +Employee reviews with detailed ratings, pros/cons, and employee information. + +Use get_metadata() to discover all available fields dynamically. 
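+
+Example (sketch; the ``client.datasets.glassdoor_reviews`` accessor is the
+one shown in the class docstring below):
+
+    >>> reviews = client.datasets.glassdoor_reviews
+    >>> rating_fields = reviews.get_rating_fields()
+    >>> matches = await reviews.search_fields("work_life")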
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +# Field categories +COMPANY_FIELDS = [ + "overview_id", + "company_name", + "glassdoor_employer_id", +] + +REVIEW_FIELDS = [ + "review_id", + "review_url", + "rating_date", + "summary", + "review_pros", + "review_cons", + "review_advice", + "advice_to_management", +] + +EMPLOYEE_FIELDS = [ + "employee_job_end_year", + "employee_length", + "employee_responses", + "employee_status", + "employee_type", + "employee_location", + "employee_job_title", +] + +RATING_FIELDS = [ + "rating_overall", + "rating_culture_values", + "rating_diversity_inclusion", + "rating_work_life", + "rating_compensation_benefits", + "rating_senior_leadership", + "rating_career_opportunities", +] + +FLAG_FIELDS = [ + "flag_covid", + "flag_featured", + "flags_business_outlook", + "flags_ceo_approval", + "flags_recommend_frend", +] + +COUNT_FIELDS = [ + "count_helpful", + "count_unhelpful", +] + + +class GlassdoorReviews(BaseDataset): + """ + Glassdoor Companies Reviews dataset. + + Employee reviews with detailed ratings across multiple dimensions, + pros/cons, advice to management, and employee metadata. + + Field Categories: + - Company: Employer ID and name + - Review: Review text, pros, cons, advice + - Employee: Job title, status, tenure, location + - Ratings: Overall, culture, work-life, compensation, etc. + - Flags: COVID, featured, outlook, CEO approval + + Example: + >>> reviews = client.datasets.glassdoor_reviews + >>> # Discover available fields + >>> metadata = await reviews.get_metadata() + >>> print(f"Total fields: {len(metadata.fields)}") + >>> + >>> # Get rating fields + >>> ratings = reviews.get_rating_fields() + >>> + >>> # Filter by rating + >>> snapshot_id = await reviews( + ... filter={"name": "rating_overall", "operator": ">=", "value": "4"}, + ... records_limit=100 + ... 
) + >>> data = await reviews.download(snapshot_id) + """ + + DATASET_ID = "gd_l7j1po0921hbu0ri1z" + NAME = "glassdoor_reviews" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None + + @staticmethod + def get_company_fields() -> List[str]: + """Get company-related field names.""" + return COMPANY_FIELDS.copy() + + @staticmethod + def get_review_fields() -> List[str]: + """Get review content field names.""" + return REVIEW_FIELDS.copy() + + @staticmethod + def get_employee_fields() -> List[str]: + """Get employee-related field names.""" + return EMPLOYEE_FIELDS.copy() + + @staticmethod + def get_rating_fields() -> List[str]: + """Get all rating field names.""" + return RATING_FIELDS.copy() + + @staticmethod + def get_flag_fields() -> List[str]: + """Get flag field names.""" + return FLAG_FIELDS.copy() + + async def get_fields_by_category(self) -> Dict[str, List[str]]: + """Get all fields grouped by category.""" + if self._fields_by_category is not None: + return self._fields_by_category + + metadata = await self.get_metadata() + result: Dict[str, List[str]] = { + "company": [], + "review": [], + "employee": [], + "rating": [], + "flag": [], + "other": [], + } + + company_set = set(COMPANY_FIELDS) + review_set = set(REVIEW_FIELDS) + employee_set = set(EMPLOYEE_FIELDS) + rating_set = set(RATING_FIELDS) + flag_set = set(FLAG_FIELDS) + + for name, field_info in metadata.fields.items(): + if not field_info.active: + continue + + if name in company_set: + result["company"].append(name) + elif name in review_set or name.startswith("review"): + result["review"].append(name) + elif name in employee_set or name.startswith("employee"): + result["employee"].append(name) + elif name in rating_set or name.startswith("rating"): + result["rating"].append(name) + elif name in flag_set or name.startswith("flag"): + result["flag"].append(name) + else: + result["other"].append(name) + + for category in result: + result[category] = sorted(result[category]) + + self._fields_by_category = result + return result + + async def search_fields(self, keyword: str) -> List[str]: + """Search for fields containing a keyword.""" + metadata = await self.get_metadata() + keyword_lower = keyword.lower() + + matches = [] + for name, field_info in metadata.fields.items(): + if keyword_lower in name.lower(): + matches.append(name) + elif field_info.description and keyword_lower in field_info.description.lower(): + matches.append(name) + + return sorted(matches) + + @staticmethod + def get_identifier_fields() -> List[str]: + """Get fields that can be used as unique identifiers.""" + return [ + "review_id", + "overview_id", + "glassdoor_employer_id", + "review_url", + ] diff --git a/src/brightdata/datasets/google_maps/__init__.py b/src/brightdata/datasets/google_maps/__init__.py new file mode 100644 index 0000000..412883b --- /dev/null +++ b/src/brightdata/datasets/google_maps/__init__.py @@ -0,0 +1,5 @@ +"""Google Maps datasets.""" + +from .reviews import GoogleMapsReviews + +__all__ = ["GoogleMapsReviews"] diff --git a/src/brightdata/datasets/google_maps/reviews.py b/src/brightdata/datasets/google_maps/reviews.py new file mode 100644 index 0000000..db6a0e9 --- /dev/null +++ b/src/brightdata/datasets/google_maps/reviews.py @@ -0,0 +1,198 @@ +""" +Google Maps Reviews dataset. + +Reviews and ratings from Google Maps places including +reviewer information, place details, and owner responses. 
+ +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +# Field categories +PLACE_FIELDS = [ + "url", + "place_id", + "place_name", + "country", + "address", + "category", + "cid", + "fid_location", + "place_general_rating", + "overall_place_riviews", + "questions_answers", +] + +REVIEWER_FIELDS = [ + "reviewer_name", + "reviews_by_reviewer", + "photos_by_reviewer", + "reviewer_url", + "local_guide", + "profile_pic_url", +] + +REVIEW_FIELDS = [ + "review_id", + "review_rating", + "review", + "review_date", + "number_of_likes", + "response_of_owner", + "response_date", + "photos", + "review_details", +] + + +class GoogleMapsReviews(BaseDataset): + """ + Google Maps Reviews dataset. + + Reviews and ratings from Google Maps places including detailed + reviewer information, place metadata, and business owner responses. + + Field Categories: + - Place: Location info, ratings, address, category + - Reviewer: Name, profile, local guide status + - Review: Rating, text, date, photos, owner response + + Example: + >>> reviews = client.datasets.google_maps_reviews + >>> # Discover available fields + >>> metadata = await reviews.get_metadata() + >>> print(f"Total fields: {len(metadata.fields)}") + >>> + >>> # Get fields by category + >>> place_fields = reviews.get_place_fields() + >>> review_fields = reviews.get_review_fields() + >>> + >>> # Filter reviews + >>> snapshot_id = await reviews( + ... filter={"name": "review_rating", "operator": ">=", "value": "4"}, + ... records_limit=100 + ... ) + >>> data = await reviews.download(snapshot_id) + """ + + DATASET_ID = "gd_luzfs1dn2oa0teb81" + NAME = "google_maps_reviews" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None + + @staticmethod + def get_place_fields() -> List[str]: + """ + Get place-related field names. + + Returns: + List of place field names + """ + return PLACE_FIELDS.copy() + + @staticmethod + def get_reviewer_fields() -> List[str]: + """ + Get reviewer-related field names. + + Returns: + List of reviewer field names + """ + return REVIEWER_FIELDS.copy() + + @staticmethod + def get_review_fields() -> List[str]: + """ + Get review-related field names. + + Returns: + List of review field names + """ + return REVIEW_FIELDS.copy() + + async def get_fields_by_category(self) -> Dict[str, List[str]]: + """ + Get all fields grouped by category. 
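+
+        Fields that match no category are grouped under ``"other"``.
+
+        Example (sketch; ``reviews`` as in the class docstring):
+            >>> categories = await reviews.get_fields_by_category()
+            >>> for category, fields in categories.items():
+            ...     print(f"{category}: {len(fields)} fields")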
+ + Returns: + Dict mapping category name to list of field names + """ + if self._fields_by_category is not None: + return self._fields_by_category + + metadata = await self.get_metadata() + result: Dict[str, List[str]] = { + "place": [], + "reviewer": [], + "review": [], + "other": [], + } + + place_set = set(PLACE_FIELDS) + reviewer_set = set(REVIEWER_FIELDS) + review_set = set(REVIEW_FIELDS) + + for name, field_info in metadata.fields.items(): + if not field_info.active: + continue + + if name in place_set or name.startswith("place_"): + result["place"].append(name) + elif name in reviewer_set or name.startswith("reviewer"): + result["reviewer"].append(name) + elif name in review_set or name.startswith("review"): + result["review"].append(name) + else: + result["other"].append(name) + + for category in result: + result[category] = sorted(result[category]) + + self._fields_by_category = result + return result + + async def search_fields(self, keyword: str) -> List[str]: + """ + Search for fields containing a keyword. + + Args: + keyword: Keyword to search for (case-insensitive) + + Returns: + List of matching field names + """ + metadata = await self.get_metadata() + keyword_lower = keyword.lower() + + matches = [] + for name, field_info in metadata.fields.items(): + if keyword_lower in name.lower(): + matches.append(name) + elif field_info.description and keyword_lower in field_info.description.lower(): + matches.append(name) + + return sorted(matches) + + @staticmethod + def get_identifier_fields() -> List[str]: + """ + Get fields that can be used as unique identifiers. + + Returns: + List of identifier field names + """ + return [ + "review_id", + "place_id", + "cid", + "url", + ] diff --git a/src/brightdata/datasets/hermes/__init__.py b/src/brightdata/datasets/hermes/__init__.py new file mode 100644 index 0000000..eaabce4 --- /dev/null +++ b/src/brightdata/datasets/hermes/__init__.py @@ -0,0 +1,5 @@ +"""Hermes datasets.""" + +from .products import HermesProducts + +__all__ = ["HermesProducts"] diff --git a/src/brightdata/datasets/hermes/products.py b/src/brightdata/datasets/hermes/products.py new file mode 100644 index 0000000..9c1444a --- /dev/null +++ b/src/brightdata/datasets/hermes/products.py @@ -0,0 +1,25 @@ +""" +Hermes Products dataset. + +Luxury product listings from Hermes. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class HermesProducts(BaseDataset): + """Hermes Products dataset.""" + + DATASET_ID = "gd_lh7sn8rz1g95zt4lwk" + NAME = "hermes_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/hm/__init__.py b/src/brightdata/datasets/hm/__init__.py new file mode 100644 index 0000000..be0614b --- /dev/null +++ b/src/brightdata/datasets/hm/__init__.py @@ -0,0 +1,5 @@ +"""H&M datasets.""" + +from .products import HMProducts + +__all__ = ["HMProducts"] diff --git a/src/brightdata/datasets/hm/products.py b/src/brightdata/datasets/hm/products.py new file mode 100644 index 0000000..72287da --- /dev/null +++ b/src/brightdata/datasets/hm/products.py @@ -0,0 +1,25 @@ +""" +H&M Products dataset. + +Fashion product listings from H&M. + +Use get_metadata() to discover all available fields dynamically. 
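+
+Example (a sketch; assumes the dataset is exposed as
+``client.datasets.hm_products``, following its NAME attribute):
+
+    >>> products = client.datasets.hm_products
+    >>> metadata = await products.get_metadata()
+    >>> snapshot_id = await products(records_limit=10)
+    >>> data = await products.download(snapshot_id)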
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class HMProducts(BaseDataset): + """H&M Products dataset.""" + + DATASET_ID = "gd_lebec5ir293umvxh5g" + NAME = "hm_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/ikea/__init__.py b/src/brightdata/datasets/ikea/__init__.py new file mode 100644 index 0000000..29aebdb --- /dev/null +++ b/src/brightdata/datasets/ikea/__init__.py @@ -0,0 +1,5 @@ +"""Ikea datasets.""" + +from .products import IkeaProducts + +__all__ = ["IkeaProducts"] diff --git a/src/brightdata/datasets/ikea/products.py b/src/brightdata/datasets/ikea/products.py new file mode 100644 index 0000000..89114e6 --- /dev/null +++ b/src/brightdata/datasets/ikea/products.py @@ -0,0 +1,25 @@ +""" +Ikea Products dataset. + +Furniture and home product listings from Ikea. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class IkeaProducts(BaseDataset): + """Ikea Products dataset.""" + + DATASET_ID = "gd_le2lfu10qrjmrqo60" + NAME = "ikea_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/indeed/__init__.py b/src/brightdata/datasets/indeed/__init__.py new file mode 100644 index 0000000..316c35a --- /dev/null +++ b/src/brightdata/datasets/indeed/__init__.py @@ -0,0 +1,6 @@ +"""Indeed datasets.""" + +from .companies import IndeedCompanies +from .jobs import IndeedJobs + +__all__ = ["IndeedCompanies", "IndeedJobs"] diff --git a/src/brightdata/datasets/indeed/companies.py b/src/brightdata/datasets/indeed/companies.py new file mode 100644 index 0000000..5f9cf29 --- /dev/null +++ b/src/brightdata/datasets/indeed/companies.py @@ -0,0 +1,198 @@ +""" +Indeed Companies Info dataset. + +Company profiles from Indeed with job listings, reviews, +salaries, and company details. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +# Field categories +COMPANY_FIELDS = [ + "name", + "description", + "url", + "website", + "industry", + "company_size", + "revenue", + "logo", + "headquarters", + "country_code", + "details", + "related_companies", + "company_id", + "overall_rating", +] + +WORK_CULTURE_FIELDS = [ + "work_happiness", + "benefits", +] + +REVIEWS_FIELDS = [ + "reviews", + "reviews_count", + "reviews_url", +] + +SALARIES_FIELDS = [ + "salaries", + "salaries_count", + "salaries_url", +] + +JOBS_FIELDS = [ + "jobs_categories", + "jobs_count", + "jobs_url", +] + +OTHER_CONTENT_FIELDS = [ + "q&a_count", + "q&a_url", + "interviews_count", + "interviews_url", + "photos_count", + "photos_url", +] + + +class IndeedCompanies(BaseDataset): + """ + Indeed Companies Info dataset. + + Company profiles with job listings, employee reviews, + salary data, and company culture information. 
+ + Field Categories: + - Company: Name, description, industry, size, revenue, location + - Work Culture: Work happiness scores, benefits + - Reviews: Review counts and links + - Salaries: Salary information and links + - Jobs: Job categories and listings + - Other Content: Q&A, interviews, photos + + Example: + >>> indeed = client.datasets.indeed_companies + >>> # Discover available fields + >>> metadata = await indeed.get_metadata() + >>> print(f"Total fields: {len(metadata.fields)}") + >>> + >>> # Filter by industry + >>> snapshot_id = await indeed( + ... filter={"name": "industry", "operator": "=", "value": "Technology"}, + ... records_limit=100 + ... ) + >>> data = await indeed.download(snapshot_id) + """ + + DATASET_ID = "gd_l7qekxkv2i7ve6hx1s" + NAME = "indeed_companies" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None + + @staticmethod + def get_company_fields() -> List[str]: + """Get company-related field names.""" + return COMPANY_FIELDS.copy() + + @staticmethod + def get_work_culture_fields() -> List[str]: + """Get work culture field names.""" + return WORK_CULTURE_FIELDS.copy() + + @staticmethod + def get_reviews_fields() -> List[str]: + """Get reviews-related field names.""" + return REVIEWS_FIELDS.copy() + + @staticmethod + def get_salaries_fields() -> List[str]: + """Get salaries-related field names.""" + return SALARIES_FIELDS.copy() + + @staticmethod + def get_jobs_fields() -> List[str]: + """Get jobs-related field names.""" + return JOBS_FIELDS.copy() + + async def get_fields_by_category(self) -> Dict[str, List[str]]: + """Get all fields grouped by category.""" + if self._fields_by_category is not None: + return self._fields_by_category + + metadata = await self.get_metadata() + result: Dict[str, List[str]] = { + "company": [], + "work_culture": [], + "reviews": [], + "salaries": [], + "jobs": [], + "other": [], + } + + company_set = set(COMPANY_FIELDS) + work_set = set(WORK_CULTURE_FIELDS) + reviews_set = set(REVIEWS_FIELDS) + salaries_set = set(SALARIES_FIELDS) + jobs_set = set(JOBS_FIELDS) + other_content_set = set(OTHER_CONTENT_FIELDS) + + for name, field_info in metadata.fields.items(): + if not field_info.active: + continue + + if name in company_set or name.startswith("company"): + result["company"].append(name) + elif name in work_set or "happiness" in name or "benefit" in name: + result["work_culture"].append(name) + elif name in reviews_set or "review" in name.lower(): + result["reviews"].append(name) + elif name in salaries_set or "salar" in name.lower(): + result["salaries"].append(name) + elif name in jobs_set or "job" in name.lower(): + result["jobs"].append(name) + elif name in other_content_set: + result["other"].append(name) + else: + result["other"].append(name) + + for category in result: + result[category] = sorted(result[category]) + + self._fields_by_category = result + return result + + async def search_fields(self, keyword: str) -> List[str]: + """Search for fields containing a keyword.""" + metadata = await self.get_metadata() + keyword_lower = keyword.lower() + + matches = [] + for name, field_info in metadata.fields.items(): + if keyword_lower in name.lower(): + matches.append(name) + elif field_info.description and keyword_lower in field_info.description.lower(): + matches.append(name) + + return sorted(matches) + + @staticmethod + def get_identifier_fields() -> List[str]: + """Get fields that can be used as unique identifiers.""" + return [ + 
"company_id", + "url", + "website", + ] diff --git a/src/brightdata/datasets/indeed/jobs.py b/src/brightdata/datasets/indeed/jobs.py new file mode 100644 index 0000000..d27adc9 --- /dev/null +++ b/src/brightdata/datasets/indeed/jobs.py @@ -0,0 +1,39 @@ +""" +Indeed Job Listings dataset. + +Job postings from Indeed with company info, salary, and requirements. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class IndeedJobs(BaseDataset): + """ + Indeed Job Listings dataset. + + Job postings with company information, salary ranges, + requirements, and application details. + + Example: + >>> jobs = client.datasets.indeed_jobs + >>> metadata = await jobs.get_metadata() + >>> snapshot_id = await jobs( + ... filter={"name": "job_title", "operator": "contains", "value": "Engineer"}, + ... records_limit=100 + ... ) + >>> data = await jobs.download(snapshot_id) + """ + + DATASET_ID = "gd_l4dx9j9sscpvs7no2" + NAME = "indeed_jobs" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/infocasas/__init__.py b/src/brightdata/datasets/infocasas/__init__.py new file mode 100644 index 0000000..68e86b5 --- /dev/null +++ b/src/brightdata/datasets/infocasas/__init__.py @@ -0,0 +1,5 @@ +"""Infocasas datasets.""" + +from .properties import InfocasasUruguay + +__all__ = ["InfocasasUruguay"] diff --git a/src/brightdata/datasets/infocasas/properties.py b/src/brightdata/datasets/infocasas/properties.py new file mode 100644 index 0000000..033f995 --- /dev/null +++ b/src/brightdata/datasets/infocasas/properties.py @@ -0,0 +1,25 @@ +""" +Infocasas Uruguay dataset. + +Real estate property listings from Infocasas Uruguay. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class InfocasasUruguay(BaseDataset): + """Infocasas Uruguay real estate dataset.""" + + DATASET_ID = "gd_lftpmbga1jwon80ddh" + NAME = "infocasas_uruguay" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/inmuebles24/__init__.py b/src/brightdata/datasets/inmuebles24/__init__.py new file mode 100644 index 0000000..4612e5e --- /dev/null +++ b/src/brightdata/datasets/inmuebles24/__init__.py @@ -0,0 +1,5 @@ +"""Inmuebles24 datasets.""" + +from .properties import Inmuebles24Mexico + +__all__ = ["Inmuebles24Mexico"] diff --git a/src/brightdata/datasets/inmuebles24/properties.py b/src/brightdata/datasets/inmuebles24/properties.py new file mode 100644 index 0000000..6abdbae --- /dev/null +++ b/src/brightdata/datasets/inmuebles24/properties.py @@ -0,0 +1,25 @@ +""" +Inmuebles24 Mexico dataset. + +Real estate property listings from Inmuebles24 Mexico. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class Inmuebles24Mexico(BaseDataset): + """Inmuebles24 Mexico real estate dataset.""" + + DATASET_ID = "gd_lfsa1vgv183347v45m" + NAME = "inmuebles24_mexico" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/instagram/__init__.py b/src/brightdata/datasets/instagram/__init__.py new file mode 100644 index 0000000..3ba3156 --- /dev/null +++ b/src/brightdata/datasets/instagram/__init__.py @@ -0,0 +1,6 @@ +"""Instagram datasets.""" + +from .profiles import InstagramProfiles +from .posts import InstagramPosts + +__all__ = ["InstagramProfiles", "InstagramPosts"] diff --git a/src/brightdata/datasets/instagram/posts.py b/src/brightdata/datasets/instagram/posts.py new file mode 100644 index 0000000..f32d605 --- /dev/null +++ b/src/brightdata/datasets/instagram/posts.py @@ -0,0 +1,25 @@ +""" +Instagram Posts dataset. + +Posts from Instagram. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class InstagramPosts(BaseDataset): + """Instagram Posts dataset.""" + + DATASET_ID = "gd_lk5ns7kz21pck8jpis" + NAME = "instagram_posts" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/instagram/profiles.py b/src/brightdata/datasets/instagram/profiles.py new file mode 100644 index 0000000..800e79d --- /dev/null +++ b/src/brightdata/datasets/instagram/profiles.py @@ -0,0 +1,39 @@ +""" +Instagram Profiles dataset. + +Instagram user profiles with follower counts, bio, posts, and engagement data. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class InstagramProfiles(BaseDataset): + """ + Instagram Profiles dataset. + + User profiles with follower metrics, bio information, + and engagement statistics. + + Example: + >>> profiles = client.datasets.instagram_profiles + >>> metadata = await profiles.get_metadata() + >>> snapshot_id = await profiles( + ... filter={"name": "followers", "operator": ">=", "value": "10000"}, + ... records_limit=100 + ... ) + >>> data = await profiles.download(snapshot_id) + """ + + DATASET_ID = "gd_l1vikfch901nx3by4" + NAME = "instagram_profiles" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/lawyers/__init__.py b/src/brightdata/datasets/lawyers/__init__.py new file mode 100644 index 0000000..c5a4714 --- /dev/null +++ b/src/brightdata/datasets/lawyers/__init__.py @@ -0,0 +1,5 @@ +"""Lawyers datasets.""" + +from .us_lawyers import USLawyers + +__all__ = ["USLawyers"] diff --git a/src/brightdata/datasets/lawyers/us_lawyers.py b/src/brightdata/datasets/lawyers/us_lawyers.py new file mode 100644 index 0000000..db9d8fc --- /dev/null +++ b/src/brightdata/datasets/lawyers/us_lawyers.py @@ -0,0 +1,217 @@ +""" +US Lawyers Directory dataset. 
+ +Lawyer profiles from Martindale-Hubbell with practice areas, +education, reviews, and contact information. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +# Field categories +PROFILE_FIELDS = [ + "url", + "name", + "isln", + "photo", + "address", + "mailing_address", + "location", + "type", + "filial", + "company", +] + +EDUCATION_FIELDS = [ + "admission", + "law_school_attended", + "university_attended", + "year_of_first_admission", + "year_established", +] + +PRACTICE_FIELDS = [ + "areas_of_practice", + "practice_count", + "office_hours", + "office_size", + "languages", +] + +CONTACT_FIELDS = [ + "fax", + "phone", + "phone_cell", + "phone_telecopier", + "website", + "video_call", +] + +REVIEW_FIELDS = [ + "profile_peer_review_count", + "profile_peer_review_star", + "profile_peer_review_awards", + "profile_peer_review_detail", + "profile_visibility", + "profile_client_recomendation_count", + "profile_client_recomendation_rating", + "profile_client_review_count", + "profile_client_review_detail", + "profile_client_review_list", + "profile_client_review_rating", + "awards", +] + +CONTENT_FIELDS = [ + "biography", + "about", + "birth_information", + "memberships", + "hobbies_interests", + "people", + "clients", + "clients2", + "transactions", + "payment_information", + "state_bar_summary", + "minority_owned", +] + + +class USLawyers(BaseDataset): + """ + US Lawyers Directory dataset. + + Lawyer profiles with practice areas, education background, + peer reviews, client reviews, and contact information. + + Field Categories: + - Profile: Name, photo, address, company, type + - Education: Law school, university, admission dates + - Practice: Areas of practice, languages, office details + - Contact: Phone, fax, website, video call + - Review: Peer reviews, client reviews, ratings, awards + - Content: Biography, memberships, clients + + Example: + >>> lawyers = client.datasets.us_lawyers + >>> # Discover available fields + >>> metadata = await lawyers.get_metadata() + >>> print(f"Total fields: {len(metadata.fields)}") + >>> + >>> # Filter by location + >>> snapshot_id = await lawyers( + ... filter={"name": "location", "operator": "=", "value": "CA"}, + ... records_limit=100 + ... 
) + >>> data = await lawyers.download(snapshot_id) + """ + + DATASET_ID = "gd_l1vil5n11okchcbvax" + NAME = "us_lawyers" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None + + @staticmethod + def get_profile_fields() -> List[str]: + """Get profile-related field names.""" + return PROFILE_FIELDS.copy() + + @staticmethod + def get_education_fields() -> List[str]: + """Get education-related field names.""" + return EDUCATION_FIELDS.copy() + + @staticmethod + def get_practice_fields() -> List[str]: + """Get practice-related field names.""" + return PRACTICE_FIELDS.copy() + + @staticmethod + def get_contact_fields() -> List[str]: + """Get contact-related field names.""" + return CONTACT_FIELDS.copy() + + @staticmethod + def get_review_fields() -> List[str]: + """Get review-related field names.""" + return REVIEW_FIELDS.copy() + + async def get_fields_by_category(self) -> Dict[str, List[str]]: + """Get all fields grouped by category.""" + if self._fields_by_category is not None: + return self._fields_by_category + + metadata = await self.get_metadata() + result: Dict[str, List[str]] = { + "profile": [], + "education": [], + "practice": [], + "contact": [], + "review": [], + "content": [], + "other": [], + } + + profile_set = set(PROFILE_FIELDS) + education_set = set(EDUCATION_FIELDS) + practice_set = set(PRACTICE_FIELDS) + contact_set = set(CONTACT_FIELDS) + review_set = set(REVIEW_FIELDS) + content_set = set(CONTENT_FIELDS) + + for name, field_info in metadata.fields.items(): + if not field_info.active: + continue + + if name in profile_set: + result["profile"].append(name) + elif name in education_set or "school" in name.lower() or "admission" in name.lower(): + result["education"].append(name) + elif name in practice_set or "practice" in name.lower(): + result["practice"].append(name) + elif name in contact_set or "phone" in name.lower(): + result["contact"].append(name) + elif name in review_set or "review" in name.lower() or "rating" in name.lower(): + result["review"].append(name) + elif name in content_set: + result["content"].append(name) + else: + result["other"].append(name) + + for category in result: + result[category] = sorted(result[category]) + + self._fields_by_category = result + return result + + async def search_fields(self, keyword: str) -> List[str]: + """Search for fields containing a keyword.""" + metadata = await self.get_metadata() + keyword_lower = keyword.lower() + + matches = [] + for name, field_info in metadata.fields.items(): + if keyword_lower in name.lower(): + matches.append(name) + elif field_info.description and keyword_lower in field_info.description.lower(): + matches.append(name) + + return sorted(matches) + + @staticmethod + def get_identifier_fields() -> List[str]: + """Get fields that can be used as unique identifiers.""" + return [ + "isln", + "url", + ] diff --git a/src/brightdata/datasets/lazada/__init__.py b/src/brightdata/datasets/lazada/__init__.py new file mode 100644 index 0000000..6b26803 --- /dev/null +++ b/src/brightdata/datasets/lazada/__init__.py @@ -0,0 +1,5 @@ +"""Lazada datasets.""" + +from .products import LazadaProducts + +__all__ = ["LazadaProducts"] diff --git a/src/brightdata/datasets/lazada/products.py b/src/brightdata/datasets/lazada/products.py new file mode 100644 index 0000000..b883aa5 --- /dev/null +++ b/src/brightdata/datasets/lazada/products.py @@ -0,0 +1,25 @@ +""" +Lazada Products dataset. 
+ +Product listings from Lazada e-commerce platform. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class LazadaProducts(BaseDataset): + """Lazada Products dataset.""" + + DATASET_ID = "gd_lk14r4zxuiw2uxpk6" + NAME = "lazada_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/lazboy/__init__.py b/src/brightdata/datasets/lazboy/__init__.py new file mode 100644 index 0000000..3794c0e --- /dev/null +++ b/src/brightdata/datasets/lazboy/__init__.py @@ -0,0 +1,5 @@ +"""La-Z-Boy datasets.""" + +from .products import LaZBoyProducts + +__all__ = ["LaZBoyProducts"] diff --git a/src/brightdata/datasets/lazboy/products.py b/src/brightdata/datasets/lazboy/products.py new file mode 100644 index 0000000..67026e3 --- /dev/null +++ b/src/brightdata/datasets/lazboy/products.py @@ -0,0 +1,25 @@ +""" +La-Z-Boy Products dataset. + +Furniture product listings from La-Z-Boy. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class LaZBoyProducts(BaseDataset): + """La-Z-Boy Products dataset.""" + + DATASET_ID = "gd_lg0nhuxkvxagfannn" + NAME = "lazboy_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/lego/__init__.py b/src/brightdata/datasets/lego/__init__.py new file mode 100644 index 0000000..cc03261 --- /dev/null +++ b/src/brightdata/datasets/lego/__init__.py @@ -0,0 +1,5 @@ +"""Lego datasets.""" + +from .products import LegoProducts + +__all__ = ["LegoProducts"] diff --git a/src/brightdata/datasets/lego/products.py b/src/brightdata/datasets/lego/products.py new file mode 100644 index 0000000..626f69f --- /dev/null +++ b/src/brightdata/datasets/lego/products.py @@ -0,0 +1,25 @@ +""" +Lego Products dataset. + +Product listings from Lego. + +Use get_metadata() to discover all available fields dynamically. 
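+
+Example (a sketch; assumes the dataset is exposed as
+``client.datasets.lego_products``, following its NAME attribute):
+
+    >>> products = client.datasets.lego_products
+    >>> metadata = await products.get_metadata()
+    >>> snapshot_id = await products(records_limit=10)
+    >>> data = await products.download(snapshot_id)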
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class LegoProducts(BaseDataset): + """Lego Products dataset.""" + + DATASET_ID = "gd_leenwt162rg85apy87" + NAME = "lego_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/linkedin/__init__.py b/src/brightdata/datasets/linkedin/__init__.py index 7b4eacf..683fa7b 100644 --- a/src/brightdata/datasets/linkedin/__init__.py +++ b/src/brightdata/datasets/linkedin/__init__.py @@ -2,5 +2,6 @@ from .people_profiles import LinkedInPeopleProfiles from .company_profiles import LinkedInCompanyProfiles +from .job_listings import LinkedInJobListings -__all__ = ["LinkedInPeopleProfiles", "LinkedInCompanyProfiles"] +__all__ = ["LinkedInPeopleProfiles", "LinkedInCompanyProfiles", "LinkedInJobListings"] diff --git a/src/brightdata/datasets/linkedin/job_listings.py b/src/brightdata/datasets/linkedin/job_listings.py new file mode 100644 index 0000000..7b01f40 --- /dev/null +++ b/src/brightdata/datasets/linkedin/job_listings.py @@ -0,0 +1,142 @@ +""" +LinkedIn Profiles Jobs Listings dataset. + +LinkedIn profiles with associated job recommendations and listings. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +# Field categories +PROFILE_FIELDS = [ + "url", + "linkedin_id", + "name", + "about", + "position", + "country_code", +] + +CAREER_FIELDS = [ + "experience", + "education", + "current_company", +] + +JOB_FIELDS = [ + "optional_jobs", +] + + +class LinkedInJobListings(BaseDataset): + """ + LinkedIn Profiles Jobs Listings dataset. + + LinkedIn profiles enriched with job recommendations and listings + that match the profile's skills and experience. + + Field Categories: + - Profile: Basic profile info (name, position, country) + - Career: Experience, education, current company + - Jobs: Recommended job listings + + Example: + >>> jobs = client.datasets.linkedin_job_listings + >>> # Discover available fields + >>> metadata = await jobs.get_metadata() + >>> print(f"Total fields: {len(metadata.fields)}") + >>> + >>> # Filter by country + >>> snapshot_id = await jobs( + ... filter={"name": "country_code", "operator": "=", "value": "US"}, + ... records_limit=100 + ... 
) + >>> data = await jobs.download(snapshot_id) + """ + + DATASET_ID = "gd_lpfll7v5hcqtkxl6l" + NAME = "linkedin_job_listings" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None + + @staticmethod + def get_profile_fields() -> List[str]: + """Get profile-related field names.""" + return PROFILE_FIELDS.copy() + + @staticmethod + def get_career_fields() -> List[str]: + """Get career-related field names.""" + return CAREER_FIELDS.copy() + + @staticmethod + def get_job_fields() -> List[str]: + """Get job listing field names.""" + return JOB_FIELDS.copy() + + async def get_fields_by_category(self) -> Dict[str, List[str]]: + """Get all fields grouped by category.""" + if self._fields_by_category is not None: + return self._fields_by_category + + metadata = await self.get_metadata() + result: Dict[str, List[str]] = { + "profile": [], + "career": [], + "jobs": [], + "other": [], + } + + profile_set = set(PROFILE_FIELDS) + career_set = set(CAREER_FIELDS) + job_set = set(JOB_FIELDS) + + for name, field_info in metadata.fields.items(): + if not field_info.active: + continue + + if name in profile_set: + result["profile"].append(name) + elif name in career_set: + result["career"].append(name) + elif name in job_set or "job" in name.lower(): + result["jobs"].append(name) + else: + result["other"].append(name) + + for category in result: + result[category] = sorted(result[category]) + + self._fields_by_category = result + return result + + async def search_fields(self, keyword: str) -> List[str]: + """Search for fields containing a keyword.""" + metadata = await self.get_metadata() + keyword_lower = keyword.lower() + + matches = [] + for name, field_info in metadata.fields.items(): + if keyword_lower in name.lower(): + matches.append(name) + elif field_info.description and keyword_lower in field_info.description.lower(): + matches.append(name) + + return sorted(matches) + + @staticmethod + def get_identifier_fields() -> List[str]: + """Get fields that can be used as unique identifiers.""" + return [ + "url", + "linkedin_id", + ] diff --git a/src/brightdata/datasets/llbean/__init__.py b/src/brightdata/datasets/llbean/__init__.py new file mode 100644 index 0000000..e425df2 --- /dev/null +++ b/src/brightdata/datasets/llbean/__init__.py @@ -0,0 +1,5 @@ +"""L.L. Bean datasets.""" + +from .products import LLBeanProducts + +__all__ = ["LLBeanProducts"] diff --git a/src/brightdata/datasets/llbean/products.py b/src/brightdata/datasets/llbean/products.py new file mode 100644 index 0000000..106e507 --- /dev/null +++ b/src/brightdata/datasets/llbean/products.py @@ -0,0 +1,25 @@ +""" +L.L. Bean Products dataset. + +Outdoor and casual product listings from L.L. Bean. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class LLBeanProducts(BaseDataset): + """L.L. 
Bean Products dataset.""" + + DATASET_ID = "gd_lemtwv4s1mglzlzh57" + NAME = "llbean_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/loewe/__init__.py b/src/brightdata/datasets/loewe/__init__.py new file mode 100644 index 0000000..8304d74 --- /dev/null +++ b/src/brightdata/datasets/loewe/__init__.py @@ -0,0 +1,5 @@ +"""Loewe datasets.""" + +from .products import LoeweProducts + +__all__ = ["LoeweProducts"] diff --git a/src/brightdata/datasets/loewe/products.py b/src/brightdata/datasets/loewe/products.py new file mode 100644 index 0000000..d881a19 --- /dev/null +++ b/src/brightdata/datasets/loewe/products.py @@ -0,0 +1,25 @@ +""" +Loewe Products dataset. + +Luxury product listings from Loewe. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class LoeweProducts(BaseDataset): + """Loewe Products dataset.""" + + DATASET_ID = "gd_lh7rkj4wwka9q19t" + NAME = "loewe_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/mango/__init__.py b/src/brightdata/datasets/mango/__init__.py new file mode 100644 index 0000000..810d882 --- /dev/null +++ b/src/brightdata/datasets/mango/__init__.py @@ -0,0 +1,5 @@ +"""Mango datasets.""" + +from .products import MangoProducts + +__all__ = ["MangoProducts"] diff --git a/src/brightdata/datasets/mango/products.py b/src/brightdata/datasets/mango/products.py new file mode 100644 index 0000000..274e96e --- /dev/null +++ b/src/brightdata/datasets/mango/products.py @@ -0,0 +1,25 @@ +""" +Mango Products dataset. + +Fashion product listings from Mango. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class MangoProducts(BaseDataset): + """Mango Products dataset.""" + + DATASET_ID = "gd_lcyua5iy1go06own9d" + NAME = "mango_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/manta/__init__.py b/src/brightdata/datasets/manta/__init__.py new file mode 100644 index 0000000..6096ee9 --- /dev/null +++ b/src/brightdata/datasets/manta/__init__.py @@ -0,0 +1,5 @@ +"""Manta datasets.""" + +from .businesses import MantaBusinesses + +__all__ = ["MantaBusinesses"] diff --git a/src/brightdata/datasets/manta/businesses.py b/src/brightdata/datasets/manta/businesses.py new file mode 100644 index 0000000..4c1d171 --- /dev/null +++ b/src/brightdata/datasets/manta/businesses.py @@ -0,0 +1,190 @@ +""" +Manta Businesses dataset. + +Business listings from Manta with company details, +location, contact information, and revenue estimates. + +Use get_metadata() to discover all available fields dynamically. 
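+
+Example (sketch; the ``client.datasets.manta_businesses`` accessor is the
+one shown in the class docstring below):
+
+    >>> manta = client.datasets.manta_businesses
+    >>> revenue_fields = await manta.search_fields("revenue")
+    >>> hours_fields = manta.get_operating_fields()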
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +# Field categories +COMPANY_FIELDS = [ + "company_name", + "company_id", + "business_categories", + "sic_code", + "description", + "services", + "owner", + "manta_link", + "url", +] + +LOCATION_FIELDS = [ + "business_state", + "business_city", + "business_country", + "business_zip_code", + "business_street_name", + "latitude", + "longitude", + "location_type", +] + +CONTACT_FIELDS = [ + "phone_number", + "website", + "links", +] + +OPERATING_FIELDS = [ + "opening_hours", + "closing_hours", + "avg_opening_hour", + "avg_closing_hour", + "closed_on", + "year_established", +] + +METRICS_FIELDS = [ + "estimated_annual_revenue", + "num_employees", + "review_count", + "reviews", +] + + +class MantaBusinesses(BaseDataset): + """ + Manta Businesses dataset. + + Business listings with company details, location data, + operating hours, and financial estimates. + + Field Categories: + - Company: Name, categories, SIC code, description, owner + - Location: Address, city, state, country, coordinates + - Contact: Phone, website, links + - Operating: Hours, established date + - Metrics: Revenue, employees, reviews + + Example: + >>> manta = client.datasets.manta_businesses + >>> # Discover available fields + >>> metadata = await manta.get_metadata() + >>> print(f"Total fields: {len(metadata.fields)}") + >>> + >>> # Filter by state + >>> snapshot_id = await manta( + ... filter={"name": "business_state", "operator": "=", "value": "California"}, + ... records_limit=100 + ... ) + >>> data = await manta.download(snapshot_id) + """ + + DATASET_ID = "gd_l1vil1d81g0u8763b2" + NAME = "manta_businesses" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None + + @staticmethod + def get_company_fields() -> List[str]: + """Get company-related field names.""" + return COMPANY_FIELDS.copy() + + @staticmethod + def get_location_fields() -> List[str]: + """Get location-related field names.""" + return LOCATION_FIELDS.copy() + + @staticmethod + def get_contact_fields() -> List[str]: + """Get contact-related field names.""" + return CONTACT_FIELDS.copy() + + @staticmethod + def get_operating_fields() -> List[str]: + """Get operating hours field names.""" + return OPERATING_FIELDS.copy() + + @staticmethod + def get_metrics_fields() -> List[str]: + """Get business metrics field names.""" + return METRICS_FIELDS.copy() + + async def get_fields_by_category(self) -> Dict[str, List[str]]: + """Get all fields grouped by category.""" + if self._fields_by_category is not None: + return self._fields_by_category + + metadata = await self.get_metadata() + result: Dict[str, List[str]] = { + "company": [], + "location": [], + "contact": [], + "operating": [], + "metrics": [], + "other": [], + } + + company_set = set(COMPANY_FIELDS) + location_set = set(LOCATION_FIELDS) + contact_set = set(CONTACT_FIELDS) + operating_set = set(OPERATING_FIELDS) + metrics_set = set(METRICS_FIELDS) + + for name, field_info in metadata.fields.items(): + if not field_info.active: + continue + + if name in company_set or name.startswith("company"): + result["company"].append(name) + elif name in location_set or name.startswith("business_"): + result["location"].append(name) + elif name in contact_set: + result["contact"].append(name) + elif name in operating_set or "hour" in name.lower(): + 
result["operating"].append(name) + elif name in metrics_set or "revenue" in name.lower() or "employee" in name.lower(): + result["metrics"].append(name) + else: + result["other"].append(name) + + for category in result: + result[category] = sorted(result[category]) + + self._fields_by_category = result + return result + + async def search_fields(self, keyword: str) -> List[str]: + """Search for fields containing a keyword.""" + metadata = await self.get_metadata() + keyword_lower = keyword.lower() + + matches = [] + for name, field_info in metadata.fields.items(): + if keyword_lower in name.lower(): + matches.append(name) + elif field_info.description and keyword_lower in field_info.description.lower(): + matches.append(name) + + return sorted(matches) + + @staticmethod + def get_identifier_fields() -> List[str]: + """Get fields that can be used as unique identifiers.""" + return [ + "company_id", + "url", + "manta_link", + ] diff --git a/src/brightdata/datasets/massimo_dutti/__init__.py b/src/brightdata/datasets/massimo_dutti/__init__.py new file mode 100644 index 0000000..adf1c22 --- /dev/null +++ b/src/brightdata/datasets/massimo_dutti/__init__.py @@ -0,0 +1,5 @@ +"""Massimo Dutti datasets.""" + +from .products import MassimoDuttiProducts + +__all__ = ["MassimoDuttiProducts"] diff --git a/src/brightdata/datasets/massimo_dutti/products.py b/src/brightdata/datasets/massimo_dutti/products.py new file mode 100644 index 0000000..5a8d70e --- /dev/null +++ b/src/brightdata/datasets/massimo_dutti/products.py @@ -0,0 +1,25 @@ +""" +Massimo Dutti Products dataset. + +Fashion product listings from Massimo Dutti. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class MassimoDuttiProducts(BaseDataset): + """Massimo Dutti Products dataset.""" + + DATASET_ID = "gd_lcxf9r252p7e46ul5b" + NAME = "massimo_dutti_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/mattressfirm/__init__.py b/src/brightdata/datasets/mattressfirm/__init__.py new file mode 100644 index 0000000..73e7e48 --- /dev/null +++ b/src/brightdata/datasets/mattressfirm/__init__.py @@ -0,0 +1,5 @@ +"""Mattressfirm datasets.""" + +from .products import MattressfirmProducts + +__all__ = ["MattressfirmProducts"] diff --git a/src/brightdata/datasets/mattressfirm/products.py b/src/brightdata/datasets/mattressfirm/products.py new file mode 100644 index 0000000..f27f434 --- /dev/null +++ b/src/brightdata/datasets/mattressfirm/products.py @@ -0,0 +1,25 @@ +""" +Mattressfirm Products dataset. + +Mattress and bedding product listings from Mattress Firm. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class MattressfirmProducts(BaseDataset): + """Mattressfirm Products dataset.""" + + DATASET_ID = "gd_legw5t6c2bvw9d7e4k" + NAME = "mattressfirm_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/mediamarkt/__init__.py b/src/brightdata/datasets/mediamarkt/__init__.py new file mode 100644 index 0000000..aae0301 --- /dev/null +++ b/src/brightdata/datasets/mediamarkt/__init__.py @@ -0,0 +1,5 @@ +"""Mediamarkt datasets.""" + +from .products import MediamarktProducts + +__all__ = ["MediamarktProducts"] diff --git a/src/brightdata/datasets/mediamarkt/products.py b/src/brightdata/datasets/mediamarkt/products.py new file mode 100644 index 0000000..5c3a151 --- /dev/null +++ b/src/brightdata/datasets/mediamarkt/products.py @@ -0,0 +1,35 @@ +""" +Mediamarkt.de Products dataset. + +Product listings from Mediamarkt Germany with prices and details. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class MediamarktProducts(BaseDataset): + """ + Mediamarkt.de Products dataset. + + Electronics and appliance products from Mediamarkt Germany. + + Example: + >>> products = client.datasets.mediamarkt_products + >>> metadata = await products.get_metadata() + >>> snapshot_id = await products(records_limit=100) + >>> data = await products.download(snapshot_id) + """ + + DATASET_ID = "gd_lbl2lo6y11m37z3gwq" + NAME = "mediamarkt_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/metrocuadrado/__init__.py b/src/brightdata/datasets/metrocuadrado/__init__.py new file mode 100644 index 0000000..7a32f98 --- /dev/null +++ b/src/brightdata/datasets/metrocuadrado/__init__.py @@ -0,0 +1,5 @@ +"""Metrocuadrado datasets.""" + +from .properties import MetrocuadradoProperties + +__all__ = ["MetrocuadradoProperties"] diff --git a/src/brightdata/datasets/metrocuadrado/properties.py b/src/brightdata/datasets/metrocuadrado/properties.py new file mode 100644 index 0000000..2dd75ad --- /dev/null +++ b/src/brightdata/datasets/metrocuadrado/properties.py @@ -0,0 +1,25 @@ +""" +Metrocuadrado Properties dataset. + +Real estate property listings from Metrocuadrado (Colombia). + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class MetrocuadradoProperties(BaseDataset): + """Metrocuadrado Properties dataset.""" + + DATASET_ID = "gd_lfsblgpf2oq16yrbny" + NAME = "metrocuadrado_properties" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/montblanc/__init__.py b/src/brightdata/datasets/montblanc/__init__.py new file mode 100644 index 0000000..2cc5e20 --- /dev/null +++ b/src/brightdata/datasets/montblanc/__init__.py @@ -0,0 +1,5 @@ +"""Montblanc datasets.""" + +from .products import MontblancProducts + +__all__ = ["MontblancProducts"] diff --git a/src/brightdata/datasets/montblanc/products.py b/src/brightdata/datasets/montblanc/products.py new file mode 100644 index 0000000..afacc99 --- /dev/null +++ b/src/brightdata/datasets/montblanc/products.py @@ -0,0 +1,25 @@ +""" +Montblanc Products dataset. + +Luxury product listings from Montblanc. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class MontblancProducts(BaseDataset): + """Montblanc Products dataset.""" + + DATASET_ID = "gd_lhahz3n9dr6srx4cm" + NAME = "montblanc_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/mouser/__init__.py b/src/brightdata/datasets/mouser/__init__.py new file mode 100644 index 0000000..48fa59d --- /dev/null +++ b/src/brightdata/datasets/mouser/__init__.py @@ -0,0 +1,5 @@ +"""Mouser datasets.""" + +from .products import MouserProducts + +__all__ = ["MouserProducts"] diff --git a/src/brightdata/datasets/mouser/products.py b/src/brightdata/datasets/mouser/products.py new file mode 100644 index 0000000..0a42a60 --- /dev/null +++ b/src/brightdata/datasets/mouser/products.py @@ -0,0 +1,25 @@ +""" +Mouser Products dataset. + +Electronic components product listings from Mouser Electronics. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class MouserProducts(BaseDataset): + """Mouser Products dataset.""" + + DATASET_ID = "gd_lfjty8942ogxzhmp8t" + NAME = "mouser_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/moynat/__init__.py b/src/brightdata/datasets/moynat/__init__.py new file mode 100644 index 0000000..cf45697 --- /dev/null +++ b/src/brightdata/datasets/moynat/__init__.py @@ -0,0 +1,5 @@ +"""Moynat datasets.""" + +from .products import MoynatProducts + +__all__ = ["MoynatProducts"] diff --git a/src/brightdata/datasets/moynat/products.py b/src/brightdata/datasets/moynat/products.py new file mode 100644 index 0000000..069dce9 --- /dev/null +++ b/src/brightdata/datasets/moynat/products.py @@ -0,0 +1,25 @@ +""" +Moynat Products dataset. + +Luxury product listings from Moynat. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class MoynatProducts(BaseDataset): + """Moynat Products dataset.""" + + DATASET_ID = "gd_lh7rh0d12qkaid87e1" + NAME = "moynat_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/mybobs/__init__.py b/src/brightdata/datasets/mybobs/__init__.py new file mode 100644 index 0000000..ef91e37 --- /dev/null +++ b/src/brightdata/datasets/mybobs/__init__.py @@ -0,0 +1,5 @@ +"""Mybobs datasets.""" + +from .products import MybobsProducts + +__all__ = ["MybobsProducts"] diff --git a/src/brightdata/datasets/mybobs/products.py b/src/brightdata/datasets/mybobs/products.py new file mode 100644 index 0000000..9029dff --- /dev/null +++ b/src/brightdata/datasets/mybobs/products.py @@ -0,0 +1,25 @@ +""" +Mybobs Products dataset. + +Furniture product listings from Bob's Discount Furniture. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class MybobsProducts(BaseDataset): + """Mybobs Products dataset.""" + + DATASET_ID = "gd_lf14k1zw1l3zcxs9m4" + NAME = "mybobs_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/olx/__init__.py b/src/brightdata/datasets/olx/__init__.py new file mode 100644 index 0000000..800d7d0 --- /dev/null +++ b/src/brightdata/datasets/olx/__init__.py @@ -0,0 +1,5 @@ +"""OLX datasets.""" + +from .ads import OLXBrazil + +__all__ = ["OLXBrazil"] diff --git a/src/brightdata/datasets/olx/ads.py b/src/brightdata/datasets/olx/ads.py new file mode 100644 index 0000000..b150502 --- /dev/null +++ b/src/brightdata/datasets/olx/ads.py @@ -0,0 +1,25 @@ +""" +OLX Brazil dataset. + +Marketplace ads from OLX Brazil. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class OLXBrazil(BaseDataset): + """OLX Brazil marketplace ads dataset.""" + + DATASET_ID = "gd_lguvsr0wp4rx7fjfo" + NAME = "olx_brazil" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/otodom/__init__.py b/src/brightdata/datasets/otodom/__init__.py new file mode 100644 index 0000000..7f58c63 --- /dev/null +++ b/src/brightdata/datasets/otodom/__init__.py @@ -0,0 +1,5 @@ +"""Otodom datasets.""" + +from .properties import OtodomPoland + +__all__ = ["OtodomPoland"] diff --git a/src/brightdata/datasets/otodom/properties.py b/src/brightdata/datasets/otodom/properties.py new file mode 100644 index 0000000..a165106 --- /dev/null +++ b/src/brightdata/datasets/otodom/properties.py @@ -0,0 +1,25 @@ +""" +Otodom Poland dataset. + +Real estate listings from Otodom (Poland). + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class OtodomPoland(BaseDataset): + """Otodom Poland real estate dataset.""" + + DATASET_ID = "gd_ld739mwou49s5y9ko" + NAME = "otodom_poland" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/owler/__init__.py b/src/brightdata/datasets/owler/__init__.py new file mode 100644 index 0000000..bf50e8f --- /dev/null +++ b/src/brightdata/datasets/owler/__init__.py @@ -0,0 +1,5 @@ +"""Owler datasets.""" + +from .companies import OwlerCompanies + +__all__ = ["OwlerCompanies"] diff --git a/src/brightdata/datasets/owler/companies.py b/src/brightdata/datasets/owler/companies.py new file mode 100644 index 0000000..6422d2b --- /dev/null +++ b/src/brightdata/datasets/owler/companies.py @@ -0,0 +1,232 @@ +""" +Owler Companies Information dataset. + +Company profiles from Owler with competitive intelligence, +funding data, and business metrics. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +# Field categories +COMPANY_FIELDS = [ + "companyId", + "companyName", + "shortName", + "ownership", + "industrySectors", + "industries", + "website", + "domainName", + "phoneNumber", + "description", + "founded", + "status", + "teamName", + "url", + "cpLink", + "sicCode", +] + +EMPLOYEE_REVENUE_FIELDS = [ + "employeeCount", + "formattedEmployeeCount", + "employeeRange", + "revenue", + "revenueRange", + "formattedRevenue", + "est_annual_revenue", + "est_employees", + "employeeHistory", + "revenueHistory", + "revenueEmpHistory", +] + +LOCATION_FIELDS = [ + "country", + "city", + "state", + "zipcode", + "street1Address", + "location", + "DMACode", +] + +FUNDING_FIELDS = [ + "totalAcquisitions", + "totalCompetitors", + "totalFundings", + "totalInvestments", + "totalFunding", + "formattedFunding", + "companyAcquisitionInfo", + "companyFundingInfo", + "fundingChartInfo", +] + +LEADERSHIP_FIELDS = [ + "ceoDetail", + "leaderShipDetails", +] + +CONTENT_FIELDS = [ + "summarySection", + "keyHighlights", + "trendingNews", + "newsPageFeed", + "seoTextMap", +] + +RELATED_FIELDS = [ + "cg", + "cgCompaniesCount", + "companies", + "recommendedCompanies", + "trendingCompanies", +] + + +class OwlerCompanies(BaseDataset): + """ + Owler Companies Information dataset. + + Company profiles with competitive intelligence, funding history, + employee and revenue metrics, and leadership information. + + Field Categories: + - Company: Name, ownership, industry, website, status + - Employee/Revenue: Counts, ranges, history + - Location: Address, city, state, country + - Funding: Acquisitions, investments, funding rounds + - Leadership: CEO details, leadership team + - Content: Summary, highlights, news + - Related: Competitors, recommended companies + + Example: + >>> owler = client.datasets.owler_companies + >>> # Discover available fields + >>> metadata = await owler.get_metadata() + >>> print(f"Total fields: {len(metadata.fields)}") + >>> + >>> # Filter by country + >>> snapshot_id = await owler( + ... filter={"name": "country", "operator": "=", "value": "USA"}, + ... records_limit=100 + ... 
)
+        >>> data = await owler.download(snapshot_id)
+    """
+
+    DATASET_ID = "gd_l1vilaxi10wutoage7"
+    NAME = "owler_companies"
+
+    def __init__(self, engine: "AsyncEngine"):
+        super().__init__(engine)
+        self._fields_by_category: Optional[Dict[str, List[str]]] = None
+
+    @staticmethod
+    def get_company_fields() -> List[str]:
+        """Get company-related field names."""
+        return COMPANY_FIELDS.copy()
+
+    @staticmethod
+    def get_employee_revenue_fields() -> List[str]:
+        """Get employee and revenue field names."""
+        return EMPLOYEE_REVENUE_FIELDS.copy()
+
+    @staticmethod
+    def get_location_fields() -> List[str]:
+        """Get location-related field names."""
+        return LOCATION_FIELDS.copy()
+
+    @staticmethod
+    def get_funding_fields() -> List[str]:
+        """Get funding-related field names."""
+        return FUNDING_FIELDS.copy()
+
+    @staticmethod
+    def get_leadership_fields() -> List[str]:
+        """Get leadership-related field names."""
+        return LEADERSHIP_FIELDS.copy()
+
+    async def get_fields_by_category(self) -> Dict[str, List[str]]:
+        """Get all fields grouped by category."""
+        if self._fields_by_category is not None:
+            return self._fields_by_category
+
+        metadata = await self.get_metadata()
+        result: Dict[str, List[str]] = {
+            "company": [],
+            "employee_revenue": [],
+            "location": [],
+            "funding": [],
+            "leadership": [],
+            "content": [],
+            "related": [],
+            "other": [],
+        }
+
+        company_set = set(COMPANY_FIELDS)
+        emp_rev_set = set(EMPLOYEE_REVENUE_FIELDS)
+        location_set = set(LOCATION_FIELDS)
+        funding_set = set(FUNDING_FIELDS)
+        leadership_set = set(LEADERSHIP_FIELDS)
+        content_set = set(CONTENT_FIELDS)
+        related_set = set(RELATED_FIELDS)
+
+        for name, field_info in metadata.fields.items():
+            if not field_info.active:
+                continue
+
+            if name in company_set or (name.startswith("company") and name not in funding_set):  # funding fields also start with "company"
+                result["company"].append(name)
+            elif name in emp_rev_set or "employee" in name.lower() or "revenue" in name.lower():
+                result["employee_revenue"].append(name)
+            elif name in location_set:
+                result["location"].append(name)
+            elif name in funding_set or "funding" in name.lower() or "acquisition" in name.lower():
+                result["funding"].append(name)
+            elif name in leadership_set or "ceo" in name.lower() or "leader" in name.lower():
+                result["leadership"].append(name)
+            elif name in content_set or "news" in name.lower() or "seo" in name.lower():
+                result["content"].append(name)
+            elif name in related_set or "competitor" in name.lower():
+                result["related"].append(name)
+            else:
+                result["other"].append(name)
+
+        for category in result:
+            result[category] = sorted(result[category])
+
+        self._fields_by_category = result
+        return result
+
+    async def search_fields(self, keyword: str) -> List[str]:
+        """Search for fields containing a keyword."""
+        metadata = await self.get_metadata()
+        keyword_lower = keyword.lower()
+
+        matches = []
+        for name, field_info in metadata.fields.items():
+            if keyword_lower in name.lower():
+                matches.append(name)
+            elif field_info.description and keyword_lower in field_info.description.lower():
+                matches.append(name)
+
+        return sorted(matches)
+
+    @staticmethod
+    def get_identifier_fields() -> List[str]:
+        """Get fields that can be used as unique identifiers."""
+        return [
+            "companyId",
+            "url",
+            "cpLink",
+            "domainName",
+        ]
diff --git a/src/brightdata/datasets/pinterest/__init__.py b/src/brightdata/datasets/pinterest/__init__.py
new file mode 100644
index 0000000..b4d1e14
--- /dev/null
+++ b/src/brightdata/datasets/pinterest/__init__.py
@@ -0,0 +1,6 @@
+"""Pinterest datasets."""
+
+from .posts import PinterestPosts
+from .profiles import PinterestProfiles
+
+__all__ = ["PinterestPosts", "PinterestProfiles"]
diff --git a/src/brightdata/datasets/pinterest/posts.py b/src/brightdata/datasets/pinterest/posts.py
new file mode 100644
index 0000000..66270da
--- /dev/null
+++ b/src/brightdata/datasets/pinterest/posts.py
@@ -0,0 +1,25 @@
+"""
+Pinterest Posts dataset.
+
+Posts/pins from Pinterest.
+
+Use get_metadata() to discover all available fields dynamically.
+"""
+
+from typing import TYPE_CHECKING, Dict, List, Optional
+
+from ..base import BaseDataset
+
+if TYPE_CHECKING:
+    from ...core.async_engine import AsyncEngine
+
+
+class PinterestPosts(BaseDataset):
+    """Pinterest Posts dataset."""
+
+    DATASET_ID = "gd_lk0sjs4d21kdr7cnlv"
+    NAME = "pinterest_posts"
+
+    def __init__(self, engine: "AsyncEngine"):
+        super().__init__(engine)
+        self._fields_by_category: Optional[Dict[str, List[str]]] = None
diff --git a/src/brightdata/datasets/pinterest/profiles.py b/src/brightdata/datasets/pinterest/profiles.py
new file mode 100644
index 0000000..b0fad50
--- /dev/null
+++ b/src/brightdata/datasets/pinterest/profiles.py
@@ -0,0 +1,25 @@
+"""
+Pinterest Profiles dataset.
+
+User profiles from Pinterest.
+
+Use get_metadata() to discover all available fields dynamically.
+"""
+
+from typing import TYPE_CHECKING, Dict, List, Optional
+
+from ..base import BaseDataset
+
+if TYPE_CHECKING:
+    from ...core.async_engine import AsyncEngine
+
+
+class PinterestProfiles(BaseDataset):
+    """Pinterest Profiles dataset."""
+
+    DATASET_ID = "gd_lk0zv93c2m9qdph46z"
+    NAME = "pinterest_profiles"
+
+    def __init__(self, engine: "AsyncEngine"):
+        super().__init__(engine)
+        self._fields_by_category: Optional[Dict[str, List[str]]] = None
diff --git a/src/brightdata/datasets/pitchbook/__init__.py b/src/brightdata/datasets/pitchbook/__init__.py
new file mode 100644
index 0000000..177b6f2
--- /dev/null
+++ b/src/brightdata/datasets/pitchbook/__init__.py
@@ -0,0 +1,5 @@
+"""PitchBook datasets."""
+
+from .companies import PitchBookCompanies
+
+__all__ = ["PitchBookCompanies"]
diff --git a/src/brightdata/datasets/pitchbook/companies.py b/src/brightdata/datasets/pitchbook/companies.py
new file mode 100644
index 0000000..41afd2b
--- /dev/null
+++ b/src/brightdata/datasets/pitchbook/companies.py
@@ -0,0 +1,171 @@
+"""
+PitchBook Companies Information dataset.
+
+Private equity and venture capital company data including
+financing rounds, investments, and deal information.
+
+Use get_metadata() to discover all available fields dynamically.
+"""
+
+from typing import TYPE_CHECKING, Dict, List, Optional
+
+from ..base import BaseDataset
+
+if TYPE_CHECKING:
+    from ...core.async_engine import AsyncEngine
+
+
+# Field categories
+COMPANY_FIELDS = [
+    "url",
+    "id",
+    "company_name",
+    "company_socials",
+    "year_founded",
+    "status",
+    "employees",
+    "description",
+    "contact_information",
+]
+
+DEAL_FIELDS = [
+    "latest_deal_type",
+    "latest_deal_amount",
+    "latest_deal_amount_value",
+    "latest_deal_date",
+    "financing_rounds",
+]
+
+INVESTMENT_FIELDS = [
+    "investments",
+    "all_investments",
+]
+
+IP_FIELDS = [
+    "patents",
+    "patent_activity",
+]
+
+OTHER_FIELDS = [
+    "competitors",
+    "research_analysis",
+    "faq",
+]
+
+
+class PitchBookCompanies(BaseDataset):
+    """
+    PitchBook Companies Information dataset.
+
+    Private company data from PitchBook including financing history,
+    deals, investments, patents, and competitive analysis.
+ + Field Categories: + - Company: Basic info, socials, status, employees + - Deals: Latest deal info, financing rounds + - Investments: Investment history + - IP: Patents and patent activity + - Other: Competitors, research, FAQ + + Example: + >>> pitchbook = client.datasets.pitchbook_companies + >>> # Discover available fields + >>> metadata = await pitchbook.get_metadata() + >>> print(f"Total fields: {len(metadata.fields)}") + >>> + >>> # Filter by status + >>> snapshot_id = await pitchbook( + ... filter={"name": "status", "operator": "=", "value": "Private"}, + ... records_limit=100 + ... ) + >>> data = await pitchbook.download(snapshot_id) + """ + + DATASET_ID = "gd_m4ijiqfp2n9oe3oluj" + NAME = "pitchbook_companies" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None + + @staticmethod + def get_company_fields() -> List[str]: + """Get company identity field names.""" + return COMPANY_FIELDS.copy() + + @staticmethod + def get_deal_fields() -> List[str]: + """Get deal-related field names.""" + return DEAL_FIELDS.copy() + + @staticmethod + def get_investment_fields() -> List[str]: + """Get investment field names.""" + return INVESTMENT_FIELDS.copy() + + @staticmethod + def get_ip_fields() -> List[str]: + """Get intellectual property field names.""" + return IP_FIELDS.copy() + + async def get_fields_by_category(self) -> Dict[str, List[str]]: + """Get all fields grouped by category.""" + if self._fields_by_category is not None: + return self._fields_by_category + + metadata = await self.get_metadata() + result: Dict[str, List[str]] = { + "company": [], + "deals": [], + "investments": [], + "ip": [], + "other": [], + } + + company_set = set(COMPANY_FIELDS) + deal_set = set(DEAL_FIELDS) + investment_set = set(INVESTMENT_FIELDS) + ip_set = set(IP_FIELDS) + + for name, field_info in metadata.fields.items(): + if not field_info.active: + continue + + if name in company_set: + result["company"].append(name) + elif name in deal_set or "deal" in name or "financing" in name: + result["deals"].append(name) + elif name in investment_set or "investment" in name: + result["investments"].append(name) + elif name in ip_set or "patent" in name: + result["ip"].append(name) + else: + result["other"].append(name) + + for category in result: + result[category] = sorted(result[category]) + + self._fields_by_category = result + return result + + async def search_fields(self, keyword: str) -> List[str]: + """Search for fields containing a keyword.""" + metadata = await self.get_metadata() + keyword_lower = keyword.lower() + + matches = [] + for name, field_info in metadata.fields.items(): + if keyword_lower in name.lower(): + matches.append(name) + elif field_info.description and keyword_lower in field_info.description.lower(): + matches.append(name) + + return sorted(matches) + + @staticmethod + def get_identifier_fields() -> List[str]: + """Get fields that can be used as unique identifiers.""" + return [ + "id", + "url", + ] diff --git a/src/brightdata/datasets/prada/__init__.py b/src/brightdata/datasets/prada/__init__.py new file mode 100644 index 0000000..91c708e --- /dev/null +++ b/src/brightdata/datasets/prada/__init__.py @@ -0,0 +1,5 @@ +"""Prada datasets.""" + +from .products import PradaProducts + +__all__ = ["PradaProducts"] diff --git a/src/brightdata/datasets/prada/products.py b/src/brightdata/datasets/prada/products.py new file mode 100644 index 0000000..e5b2f4d --- /dev/null +++ 
b/src/brightdata/datasets/prada/products.py @@ -0,0 +1,25 @@ +""" +Prada Products dataset. + +Luxury product listings from Prada. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class PradaProducts(BaseDataset): + """Prada Products dataset.""" + + DATASET_ID = "gd_lhahqiq52egng5v35i" + NAME = "prada_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/properati/__init__.py b/src/brightdata/datasets/properati/__init__.py new file mode 100644 index 0000000..7d7234f --- /dev/null +++ b/src/brightdata/datasets/properati/__init__.py @@ -0,0 +1,5 @@ +"""Properati datasets.""" + +from .properties import ProperatiProperties + +__all__ = ["ProperatiProperties"] diff --git a/src/brightdata/datasets/properati/properties.py b/src/brightdata/datasets/properati/properties.py new file mode 100644 index 0000000..34d8a1c --- /dev/null +++ b/src/brightdata/datasets/properati/properties.py @@ -0,0 +1,25 @@ +""" +Properati Properties dataset. + +Real estate property listings from Properati (Argentina and Colombia). + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class ProperatiProperties(BaseDataset): + """Properati Properties dataset.""" + + DATASET_ID = "gd_lg3nvn6ibrhbotstw" + NAME = "properati_properties" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/raymourflanigan/__init__.py b/src/brightdata/datasets/raymourflanigan/__init__.py new file mode 100644 index 0000000..cc1800f --- /dev/null +++ b/src/brightdata/datasets/raymourflanigan/__init__.py @@ -0,0 +1,5 @@ +"""Raymour and Flanigan datasets.""" + +from .products import RaymourFlaniganProducts + +__all__ = ["RaymourFlaniganProducts"] diff --git a/src/brightdata/datasets/raymourflanigan/products.py b/src/brightdata/datasets/raymourflanigan/products.py new file mode 100644 index 0000000..664e886 --- /dev/null +++ b/src/brightdata/datasets/raymourflanigan/products.py @@ -0,0 +1,25 @@ +""" +Raymour and Flanigan Products dataset. + +Furniture product listings from Raymour and Flanigan. + +Use get_metadata() to discover all available fields dynamically. 
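+
+Example (illustrative; follows the same call/download flow documented on
+the other dataset classes):
+    >>> products = client.datasets.raymourflanigan_products
+    >>> metadata = await products.get_metadata()
+    >>> snapshot_id = await products(records_limit=100)
+    >>> data = await products.download(snapshot_id)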
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class RaymourFlaniganProducts(BaseDataset): + """Raymour and Flanigan Products dataset.""" + + DATASET_ID = "gd_lf8cwb8wxoiqarizb" + NAME = "raymourflanigan_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/real_estate/__init__.py b/src/brightdata/datasets/real_estate/__init__.py new file mode 100644 index 0000000..9346a6d --- /dev/null +++ b/src/brightdata/datasets/real_estate/__init__.py @@ -0,0 +1,5 @@ +"""Real Estate datasets.""" + +from .australia import AustraliaRealEstate + +__all__ = ["AustraliaRealEstate"] diff --git a/src/brightdata/datasets/real_estate/australia.py b/src/brightdata/datasets/real_estate/australia.py new file mode 100644 index 0000000..f5bfe86 --- /dev/null +++ b/src/brightdata/datasets/real_estate/australia.py @@ -0,0 +1,39 @@ +""" +Australia Real Estate Properties dataset. + +Property listings from Australia with prices, locations, and details. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class AustraliaRealEstate(BaseDataset): + """ + Australia Real Estate Properties dataset. + + Property listings with prices, locations, features, + and agent information. + + Example: + >>> properties = client.datasets.australia_real_estate + >>> metadata = await properties.get_metadata() + >>> snapshot_id = await properties( + ... filter={"name": "state", "operator": "=", "value": "NSW"}, + ... records_limit=100 + ... ) + >>> data = await properties.download(snapshot_id) + """ + + DATASET_ID = "gd_l3cvjh111l943r4awk" + NAME = "australia_real_estate" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/sephora/__init__.py b/src/brightdata/datasets/sephora/__init__.py new file mode 100644 index 0000000..3ac0948 --- /dev/null +++ b/src/brightdata/datasets/sephora/__init__.py @@ -0,0 +1,5 @@ +"""Sephora datasets.""" + +from .products import SephoraProducts + +__all__ = ["SephoraProducts"] diff --git a/src/brightdata/datasets/sephora/products.py b/src/brightdata/datasets/sephora/products.py new file mode 100644 index 0000000..f901f43 --- /dev/null +++ b/src/brightdata/datasets/sephora/products.py @@ -0,0 +1,25 @@ +""" +Sephora Products dataset. + +Beauty and cosmetics product listings from Sephora. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class SephoraProducts(BaseDataset): + """Sephora Products dataset.""" + + DATASET_ID = "gd_lbz49igcthopwaygd" + NAME = "sephora_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/shein/__init__.py b/src/brightdata/datasets/shein/__init__.py new file mode 100644 index 0000000..55d99a1 --- /dev/null +++ b/src/brightdata/datasets/shein/__init__.py @@ -0,0 +1,5 @@ +"""Shein datasets.""" + +from .products import SheinProducts + +__all__ = ["SheinProducts"] diff --git a/src/brightdata/datasets/shein/products.py b/src/brightdata/datasets/shein/products.py new file mode 100644 index 0000000..3f61959 --- /dev/null +++ b/src/brightdata/datasets/shein/products.py @@ -0,0 +1,25 @@ +""" +Shein Products dataset. + +Fashion product listings from Shein. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class SheinProducts(BaseDataset): + """Shein Products dataset.""" + + DATASET_ID = "gd_lemu5ceq1jxjo7vzit" + NAME = "shein_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/shopee/__init__.py b/src/brightdata/datasets/shopee/__init__.py new file mode 100644 index 0000000..6cc5e71 --- /dev/null +++ b/src/brightdata/datasets/shopee/__init__.py @@ -0,0 +1,5 @@ +"""Shopee datasets.""" + +from .products import ShopeeProducts + +__all__ = ["ShopeeProducts"] diff --git a/src/brightdata/datasets/shopee/products.py b/src/brightdata/datasets/shopee/products.py new file mode 100644 index 0000000..7fdc62a --- /dev/null +++ b/src/brightdata/datasets/shopee/products.py @@ -0,0 +1,25 @@ +""" +Shopee Products dataset. + +Product listings from Shopee e-commerce platform. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class ShopeeProducts(BaseDataset): + """Shopee Products dataset.""" + + DATASET_ID = "gd_lk122xxgf86xf97py" + NAME = "shopee_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/sleepnumber/__init__.py b/src/brightdata/datasets/sleepnumber/__init__.py new file mode 100644 index 0000000..988c52a --- /dev/null +++ b/src/brightdata/datasets/sleepnumber/__init__.py @@ -0,0 +1,5 @@ +"""Sleep Number datasets.""" + +from .products import SleepNumberProducts + +__all__ = ["SleepNumberProducts"] diff --git a/src/brightdata/datasets/sleepnumber/products.py b/src/brightdata/datasets/sleepnumber/products.py new file mode 100644 index 0000000..d171442 --- /dev/null +++ b/src/brightdata/datasets/sleepnumber/products.py @@ -0,0 +1,25 @@ +""" +Sleep Number Products dataset. + +Mattress and bedding product listings from Sleep Number. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class SleepNumberProducts(BaseDataset): + """Sleep Number Products dataset.""" + + DATASET_ID = "gd_lf8ctgxj1dpkzvl862" + NAME = "sleepnumber_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/slintel/__init__.py b/src/brightdata/datasets/slintel/__init__.py new file mode 100644 index 0000000..33b2352 --- /dev/null +++ b/src/brightdata/datasets/slintel/__init__.py @@ -0,0 +1,5 @@ +"""Slintel/6sense datasets.""" + +from .companies import SlintelCompanies + +__all__ = ["SlintelCompanies"] diff --git a/src/brightdata/datasets/slintel/companies.py b/src/brightdata/datasets/slintel/companies.py new file mode 100644 index 0000000..6115b4e --- /dev/null +++ b/src/brightdata/datasets/slintel/companies.py @@ -0,0 +1,152 @@ +""" +Slintel 6sense Company Information dataset. + +Company profiles from 6sense/Slintel with technographics, +industry data, and business information. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +# Field categories +COMPANY_FIELDS = [ + "name", + "about", + "num_employees", + "type", + "industries", + "website", + "logo", + "location", + "region", + "country_code", + "id", + "url", + "stock_symbol", +] + +TECH_FIELDS = [ + "techstack_arr", + "slintel_resources", +] + +SOCIAL_FIELDS = [ + "social_media_urls", + "company_news", + "last_updated", +] + + +class SlintelCompanies(BaseDataset): + """ + Slintel 6sense Company Information dataset. + + Company profiles with technographics, industry classification, + and business intelligence data. + + Field Categories: + - Company: Name, description, size, type, industry, location + - Tech: Technology stack, Slintel resources + - Social: Social media URLs, company news, last updated + + Example: + >>> slintel = client.datasets.slintel_companies + >>> # Discover available fields + >>> metadata = await slintel.get_metadata() + >>> print(f"Total fields: {len(metadata.fields)}") + >>> + >>> # Filter by country + >>> snapshot_id = await slintel( + ... filter={"name": "country_code", "operator": "=", "value": "US"}, + ... records_limit=100 + ... 
) + >>> data = await slintel.download(snapshot_id) + """ + + DATASET_ID = "gd_l1vilg5a1decoahvgq" + NAME = "slintel_companies" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None + + @staticmethod + def get_company_fields() -> List[str]: + """Get company-related field names.""" + return COMPANY_FIELDS.copy() + + @staticmethod + def get_tech_fields() -> List[str]: + """Get technology-related field names.""" + return TECH_FIELDS.copy() + + @staticmethod + def get_social_fields() -> List[str]: + """Get social and news field names.""" + return SOCIAL_FIELDS.copy() + + async def get_fields_by_category(self) -> Dict[str, List[str]]: + """Get all fields grouped by category.""" + if self._fields_by_category is not None: + return self._fields_by_category + + metadata = await self.get_metadata() + result: Dict[str, List[str]] = { + "company": [], + "tech": [], + "social": [], + "other": [], + } + + company_set = set(COMPANY_FIELDS) + tech_set = set(TECH_FIELDS) + social_set = set(SOCIAL_FIELDS) + + for name, field_info in metadata.fields.items(): + if not field_info.active: + continue + + if name in company_set: + result["company"].append(name) + elif name in tech_set or "tech" in name.lower(): + result["tech"].append(name) + elif name in social_set or "social" in name or "news" in name: + result["social"].append(name) + else: + result["other"].append(name) + + for category in result: + result[category] = sorted(result[category]) + + self._fields_by_category = result + return result + + async def search_fields(self, keyword: str) -> List[str]: + """Search for fields containing a keyword.""" + metadata = await self.get_metadata() + keyword_lower = keyword.lower() + + matches = [] + for name, field_info in metadata.fields.items(): + if keyword_lower in name.lower(): + matches.append(name) + elif field_info.description and keyword_lower in field_info.description.lower(): + matches.append(name) + + return sorted(matches) + + @staticmethod + def get_identifier_fields() -> List[str]: + """Get fields that can be used as unique identifiers.""" + return [ + "id", + "url", + "website", + ] diff --git a/src/brightdata/datasets/tiktok/__init__.py b/src/brightdata/datasets/tiktok/__init__.py new file mode 100644 index 0000000..8569218 --- /dev/null +++ b/src/brightdata/datasets/tiktok/__init__.py @@ -0,0 +1,5 @@ +"""TikTok datasets.""" + +from .profiles import TikTokProfiles + +__all__ = ["TikTokProfiles"] diff --git a/src/brightdata/datasets/tiktok/profiles.py b/src/brightdata/datasets/tiktok/profiles.py new file mode 100644 index 0000000..f3164df --- /dev/null +++ b/src/brightdata/datasets/tiktok/profiles.py @@ -0,0 +1,39 @@ +""" +TikTok Profiles dataset. + +TikTok user profiles with follower counts, bio, and engagement data. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class TikTokProfiles(BaseDataset): + """ + TikTok Profiles dataset. + + User profiles with follower metrics, bio information, + and engagement statistics. + + Example: + >>> profiles = client.datasets.tiktok_profiles + >>> metadata = await profiles.get_metadata() + >>> snapshot_id = await profiles( + ... filter={"name": "followers", "operator": ">=", "value": "10000"}, + ... records_limit=100 + ... 
) + >>> data = await profiles.download(snapshot_id) + """ + + DATASET_ID = "gd_l1villgoiiidt09ci" + NAME = "tiktok_profiles" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/toctoc/__init__.py b/src/brightdata/datasets/toctoc/__init__.py new file mode 100644 index 0000000..19ea8da --- /dev/null +++ b/src/brightdata/datasets/toctoc/__init__.py @@ -0,0 +1,5 @@ +"""Toctoc datasets.""" + +from .properties import ToctocProperties + +__all__ = ["ToctocProperties"] diff --git a/src/brightdata/datasets/toctoc/properties.py b/src/brightdata/datasets/toctoc/properties.py new file mode 100644 index 0000000..9937c33 --- /dev/null +++ b/src/brightdata/datasets/toctoc/properties.py @@ -0,0 +1,25 @@ +""" +Toctoc Properties dataset. + +Real estate property listings from Toctoc. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class ToctocProperties(BaseDataset): + """Toctoc Properties dataset.""" + + DATASET_ID = "gd_lgfdx3l01behlrboh7" + NAME = "toctoc_properties" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/toysrus/__init__.py b/src/brightdata/datasets/toysrus/__init__.py new file mode 100644 index 0000000..3364e66 --- /dev/null +++ b/src/brightdata/datasets/toysrus/__init__.py @@ -0,0 +1,5 @@ +"""Toys R Us datasets.""" + +from .products import ToysRUsProducts + +__all__ = ["ToysRUsProducts"] diff --git a/src/brightdata/datasets/toysrus/products.py b/src/brightdata/datasets/toysrus/products.py new file mode 100644 index 0000000..fbe4cf0 --- /dev/null +++ b/src/brightdata/datasets/toysrus/products.py @@ -0,0 +1,25 @@ +""" +Toys R Us Products dataset. + +Toy product listings from Toys R Us. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class ToysRUsProducts(BaseDataset): + """Toys R Us Products dataset.""" + + DATASET_ID = "gd_lemuapao1lkjggvn05" + NAME = "toysrus_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/trustpilot/__init__.py b/src/brightdata/datasets/trustpilot/__init__.py new file mode 100644 index 0000000..223fc09 --- /dev/null +++ b/src/brightdata/datasets/trustpilot/__init__.py @@ -0,0 +1,5 @@ +"""Trustpilot datasets.""" + +from .reviews import TrustpilotReviews + +__all__ = ["TrustpilotReviews"] diff --git a/src/brightdata/datasets/trustpilot/reviews.py b/src/brightdata/datasets/trustpilot/reviews.py new file mode 100644 index 0000000..190e9fe --- /dev/null +++ b/src/brightdata/datasets/trustpilot/reviews.py @@ -0,0 +1,185 @@ +""" +Trustpilot Business Reviews dataset. + +Business reviews from Trustpilot with company info, ratings, +and reviewer details. + +Use get_metadata() to discover all available fields dynamically. 
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +# Field categories +COMPANY_FIELDS = [ + "company_name", + "company_id", + "company_logo", + "company_website", + "company_rating_name", + "company_overall_rating", + "is_verified_company", + "company_total_reviews", + "company_about", + "company_email", + "company_phone", + "company_location", + "company_country", + "company_category", + "company_other_categories", + "company activity", + "breadcrumbs", +] + +RATING_DISTRIBUTION_FIELDS = [ + "5_star", + "4_star", + "3_star", + "2_star", + "1_star", +] + +REVIEW_FIELDS = [ + "review_id", + "review_date", + "review_rating", + "review_title", + "review_content", + "is_verified_review", + "review_date_of_experience", + "review_replies", + "review_useful_count", + "review_url", + "date_posted", + "url", +] + +REVIEWER_FIELDS = [ + "reviewer_name", + "reviewer_location", + "reviews_posted_overall", +] + + +class TrustpilotReviews(BaseDataset): + """ + Trustpilot Business Reviews dataset. + + Business reviews with company profiles, rating distributions, + and detailed review content from Trustpilot. + + Field Categories: + - Company: Name, website, rating, verification status + - Rating Distribution: Star rating breakdown (1-5 stars) + - Review: Content, date, rating, replies + - Reviewer: Name, location, review count + + Example: + >>> trustpilot = client.datasets.trustpilot_reviews + >>> # Discover available fields + >>> metadata = await trustpilot.get_metadata() + >>> print(f"Total fields: {len(metadata.fields)}") + >>> + >>> # Filter by rating + >>> snapshot_id = await trustpilot( + ... filter={"name": "review_rating", "operator": ">=", "value": "4"}, + ... records_limit=100 + ... 
)
+        >>> data = await trustpilot.download(snapshot_id)
+    """
+
+    DATASET_ID = "gd_lm5zmhwd2sni130p"
+    NAME = "trustpilot_reviews"
+
+    def __init__(self, engine: "AsyncEngine"):
+        super().__init__(engine)
+        self._fields_by_category: Optional[Dict[str, List[str]]] = None
+
+    @staticmethod
+    def get_company_fields() -> List[str]:
+        """Get company-related field names."""
+        return COMPANY_FIELDS.copy()
+
+    @staticmethod
+    def get_rating_distribution_fields() -> List[str]:
+        """Get rating distribution field names."""
+        return RATING_DISTRIBUTION_FIELDS.copy()
+
+    @staticmethod
+    def get_review_fields() -> List[str]:
+        """Get review content field names."""
+        return REVIEW_FIELDS.copy()
+
+    @staticmethod
+    def get_reviewer_fields() -> List[str]:
+        """Get reviewer-related field names."""
+        return REVIEWER_FIELDS.copy()
+
+    async def get_fields_by_category(self) -> Dict[str, List[str]]:
+        """Get all fields grouped by category."""
+        if self._fields_by_category is not None:
+            return self._fields_by_category
+
+        metadata = await self.get_metadata()
+        result: Dict[str, List[str]] = {
+            "company": [],
+            "rating_distribution": [],
+            "review": [],
+            "reviewer": [],
+            "other": [],
+        }
+
+        company_set = set(COMPANY_FIELDS)
+        rating_set = set(RATING_DISTRIBUTION_FIELDS)
+        review_set = set(REVIEW_FIELDS)
+        reviewer_set = set(REVIEWER_FIELDS)
+
+        for name, field_info in metadata.fields.items():
+            if not field_info.active:
+                continue
+
+            if name in company_set or name.startswith("company_"):
+                result["company"].append(name)
+            elif name in rating_set or name.endswith("_star"):
+                result["rating_distribution"].append(name)
+            elif name in reviewer_set or name.startswith("reviewer"):  # must run before the broader "review" prefix check
+                result["reviewer"].append(name)
+            elif name in review_set or name.startswith("review"):
+                result["review"].append(name)
+            else:
+                result["other"].append(name)
+
+        for category in result:
+            result[category] = sorted(result[category])
+
+        self._fields_by_category = result
+        return result
+
+    async def search_fields(self, keyword: str) -> List[str]:
+        """Search for fields containing a keyword."""
+        metadata = await self.get_metadata()
+        keyword_lower = keyword.lower()
+
+        matches = []
+        for name, field_info in metadata.fields.items():
+            if keyword_lower in name.lower():
+                matches.append(name)
+            elif field_info.description and keyword_lower in field_info.description.lower():
+                matches.append(name)
+
+        return sorted(matches)
+
+    @staticmethod
+    def get_identifier_fields() -> List[str]:
+        """Get fields that can be used as unique identifiers."""
+        return [
+            "review_id",
+            "company_id",
+            "review_url",
+            "url",
+        ]
diff --git a/src/brightdata/datasets/trustradius/__init__.py b/src/brightdata/datasets/trustradius/__init__.py
new file mode 100644
index 0000000..b9331aa
--- /dev/null
+++ b/src/brightdata/datasets/trustradius/__init__.py
@@ -0,0 +1,5 @@
+"""TrustRadius datasets."""
+
+from .reviews import TrustRadiusReviews
+
+__all__ = ["TrustRadiusReviews"]
diff --git a/src/brightdata/datasets/trustradius/reviews.py b/src/brightdata/datasets/trustradius/reviews.py
new file mode 100644
index 0000000..4e0cae4
--- /dev/null
+++ b/src/brightdata/datasets/trustradius/reviews.py
@@ -0,0 +1,211 @@
+"""
+TrustRadius Product Reviews dataset.
+
+Software product reviews from TrustRadius with detailed
+ratings, pros/cons, and reviewer information.
+
+Use get_metadata() to discover all available fields dynamically.
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +# Field categories +PRODUCT_FIELDS = [ + "url", + "product_id", + "product_name", +] + +REVIEW_FIELDS = [ + "review_id", + "review_url", + "review_title", + "review_rating", + "review_date", + "usability_rating", + "implementation_rating", + "support_rating", + "likelihood_to_recommend", + "likelihood_to_renew", + "start_date", + "updated_date", + "author_incentivized", +] + +AUTHOR_FIELDS = [ + "review_author", + "author_position", + "author_company_name", + "author_company_industry", + "author_company_size", + "author_labels", + "author_experience_years", + "author_linkedin_url", + "author_image", +] + +CONTENT_FIELDS = [ + "pros", + "cons", + "pros_cons", + "usecases_deployment_scope", + "return_on_investment", + "efficiencies_gained", + "key_insights", + "usability_pros", + "usability_cons", + "easy_tasks", + "difficult_tasks", + "support_pros", + "support_cons", + "implementation_issues", + "implementation_partner", +] + +PRODUCT_DETAILS_FIELDS = [ + "alternatives_considered", + "other_software_used", + "users_and_roles", + "support_headcount_required", + "business_processes_supported", + "innovative_uses", + "future_planned_uses", + "products_replaced", + "key_differentiators", + "feature_ratings", +] + + +class TrustRadiusReviews(BaseDataset): + """ + TrustRadius Product Reviews dataset. + + Software product reviews with detailed ratings, + pros/cons analysis, and reviewer company information. + + Field Categories: + - Product: ID, name, URL + - Review: Ratings, dates, recommendation scores + - Author: Reviewer info, company, experience + - Content: Pros, cons, insights, ROI + - Product Details: Alternatives, features, use cases + + Example: + >>> reviews = client.datasets.trustradius_reviews + >>> # Discover available fields + >>> metadata = await reviews.get_metadata() + >>> print(f"Total fields: {len(metadata.fields)}") + >>> + >>> # Filter by rating + >>> snapshot_id = await reviews( + ... filter={"name": "review_rating", "operator": ">=", "value": "8"}, + ... records_limit=100 + ... 
)
+        >>> data = await reviews.download(snapshot_id)
+    """
+
+    DATASET_ID = "gd_lztojazw1389985ops"
+    NAME = "trustradius_reviews"
+
+    def __init__(self, engine: "AsyncEngine"):
+        super().__init__(engine)
+        self._fields_by_category: Optional[Dict[str, List[str]]] = None
+
+    @staticmethod
+    def get_product_fields() -> List[str]:
+        """Get product-related field names."""
+        return PRODUCT_FIELDS.copy()
+
+    @staticmethod
+    def get_review_fields() -> List[str]:
+        """Get review-related field names."""
+        return REVIEW_FIELDS.copy()
+
+    @staticmethod
+    def get_author_fields() -> List[str]:
+        """Get author-related field names."""
+        return AUTHOR_FIELDS.copy()
+
+    @staticmethod
+    def get_content_fields() -> List[str]:
+        """Get content-related field names."""
+        return CONTENT_FIELDS.copy()
+
+    @staticmethod
+    def get_product_details_fields() -> List[str]:
+        """Get product details field names."""
+        return PRODUCT_DETAILS_FIELDS.copy()
+
+    async def get_fields_by_category(self) -> Dict[str, List[str]]:
+        """Get all fields grouped by category."""
+        if self._fields_by_category is not None:
+            return self._fields_by_category
+
+        metadata = await self.get_metadata()
+        result: Dict[str, List[str]] = {
+            "product": [],
+            "review": [],
+            "author": [],
+            "content": [],
+            "product_details": [],
+            "other": [],
+        }
+
+        product_set = set(PRODUCT_FIELDS)
+        review_set = set(REVIEW_FIELDS)
+        author_set = set(AUTHOR_FIELDS)
+        content_set = set(CONTENT_FIELDS)
+        details_set = set(PRODUCT_DETAILS_FIELDS)
+
+        for name, field_info in metadata.fields.items():
+            if not field_info.active:
+                continue
+
+            if name in product_set or (name.startswith("product") and name not in details_set):  # "products_replaced" belongs to details
+                result["product"].append(name)
+            elif name in review_set or (name.startswith("review") and name not in author_set):
+                result["review"].append(name)
+            elif name in author_set or name.startswith("author"):
+                result["author"].append(name)
+            elif name in content_set or name in ["pros", "cons"]:
+                result["content"].append(name)
+            elif name in details_set:
+                result["product_details"].append(name)
+            else:
+                result["other"].append(name)
+
+        for category in result:
+            result[category] = sorted(result[category])
+
+        self._fields_by_category = result
+        return result
+
+    async def search_fields(self, keyword: str) -> List[str]:
+        """Search for fields containing a keyword."""
+        metadata = await self.get_metadata()
+        keyword_lower = keyword.lower()
+
+        matches = []
+        for name, field_info in metadata.fields.items():
+            if keyword_lower in name.lower():
+                matches.append(name)
+            elif field_info.description and keyword_lower in field_info.description.lower():
+                matches.append(name)
+
+        return sorted(matches)
+
+    @staticmethod
+    def get_identifier_fields() -> List[str]:
+        """Get fields that can be used as unique identifiers."""
+        return [
+            "review_id",
+            "product_id",
+            "review_url",
+            "url",
+        ]
diff --git a/src/brightdata/datasets/ventureradar/__init__.py b/src/brightdata/datasets/ventureradar/__init__.py
new file mode 100644
index 0000000..edc4e21
--- /dev/null
+++ b/src/brightdata/datasets/ventureradar/__init__.py
@@ -0,0 +1,5 @@
+"""VentureRadar datasets."""
+
+from .companies import VentureRadarCompanies
+
+__all__ = ["VentureRadarCompanies"]
diff --git a/src/brightdata/datasets/ventureradar/companies.py b/src/brightdata/datasets/ventureradar/companies.py
new file mode 100644
index 0000000..dc83f50
--- /dev/null
+++ b/src/brightdata/datasets/ventureradar/companies.py
@@ -0,0 +1,173 @@
+"""
+VentureRadar Company Information dataset.
+ +Company profiles from VentureRadar with startup intelligence, +funding signals, and competitive data. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +# Field categories +COMPANY_FIELDS = [ + "url", + "name", + "title", + "description", + "description_source", + "location", + "country_code", + "ownership", + "founded", + "company_id", + "website", +] + +SCORES_FIELDS = [ + "score", + "auto_analyst_score", + "website_popularity", + "website_popularity_graph", + "sub_scores", +] + +CONTACT_FIELDS = [ + "email", + "linkedin", + "twitter", + "keywords", +] + +INTELLIGENCE_FIELDS = [ + "similar", + "hostorical_profiles", + "areas_of_focus", + "cards", + "funding_signals", + "employee_satisfaction", +] + + +class VentureRadarCompanies(BaseDataset): + """ + VentureRadar Company Information dataset. + + Startup and company profiles with analyst scores, + funding signals, and competitive intelligence. + + Field Categories: + - Company: Name, description, location, ownership, founded + - Scores: Analyst scores, website popularity, sub-scores + - Contact: Email, LinkedIn, Twitter, keywords + - Intelligence: Similar companies, funding signals, focus areas + + Example: + >>> vr = client.datasets.ventureradar_companies + >>> # Discover available fields + >>> metadata = await vr.get_metadata() + >>> print(f"Total fields: {len(metadata.fields)}") + >>> + >>> # Filter by country + >>> snapshot_id = await vr( + ... filter={"name": "country_code", "operator": "=", "value": "US"}, + ... records_limit=100 + ... ) + >>> data = await vr.download(snapshot_id) + """ + + DATASET_ID = "gd_l1vilsfd1xpsndbtpr" + NAME = "ventureradar_companies" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None + + @staticmethod + def get_company_fields() -> List[str]: + """Get company-related field names.""" + return COMPANY_FIELDS.copy() + + @staticmethod + def get_scores_fields() -> List[str]: + """Get scores and metrics field names.""" + return SCORES_FIELDS.copy() + + @staticmethod + def get_contact_fields() -> List[str]: + """Get contact and social field names.""" + return CONTACT_FIELDS.copy() + + @staticmethod + def get_intelligence_fields() -> List[str]: + """Get business intelligence field names.""" + return INTELLIGENCE_FIELDS.copy() + + async def get_fields_by_category(self) -> Dict[str, List[str]]: + """Get all fields grouped by category.""" + if self._fields_by_category is not None: + return self._fields_by_category + + metadata = await self.get_metadata() + result: Dict[str, List[str]] = { + "company": [], + "scores": [], + "contact": [], + "intelligence": [], + "other": [], + } + + company_set = set(COMPANY_FIELDS) + scores_set = set(SCORES_FIELDS) + contact_set = set(CONTACT_FIELDS) + intelligence_set = set(INTELLIGENCE_FIELDS) + + for name, field_info in metadata.fields.items(): + if not field_info.active: + continue + + if name in company_set: + result["company"].append(name) + elif name in scores_set or "score" in name.lower() or "popularity" in name.lower(): + result["scores"].append(name) + elif name in contact_set: + result["contact"].append(name) + elif name in intelligence_set or "funding" in name.lower() or "similar" in name.lower(): + result["intelligence"].append(name) + else: + result["other"].append(name) + + for category in 
result: + result[category] = sorted(result[category]) + + self._fields_by_category = result + return result + + async def search_fields(self, keyword: str) -> List[str]: + """Search for fields containing a keyword.""" + metadata = await self.get_metadata() + keyword_lower = keyword.lower() + + matches = [] + for name, field_info in metadata.fields.items(): + if keyword_lower in name.lower(): + matches.append(name) + elif field_info.description and keyword_lower in field_info.description.lower(): + matches.append(name) + + return sorted(matches) + + @staticmethod + def get_identifier_fields() -> List[str]: + """Get fields that can be used as unique identifiers.""" + return [ + "company_id", + "url", + "website", + ] diff --git a/src/brightdata/datasets/walmart/__init__.py b/src/brightdata/datasets/walmart/__init__.py new file mode 100644 index 0000000..b7541d2 --- /dev/null +++ b/src/brightdata/datasets/walmart/__init__.py @@ -0,0 +1,5 @@ +"""Walmart datasets.""" + +from .products import WalmartProducts + +__all__ = ["WalmartProducts"] diff --git a/src/brightdata/datasets/walmart/products.py b/src/brightdata/datasets/walmart/products.py new file mode 100644 index 0000000..53f4cc2 --- /dev/null +++ b/src/brightdata/datasets/walmart/products.py @@ -0,0 +1,39 @@ +""" +Walmart Products dataset. + +Product listings from Walmart with prices, ratings, and details. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class WalmartProducts(BaseDataset): + """ + Walmart Products dataset. + + Product listings with prices, ratings, availability, + and seller information. + + Example: + >>> products = client.datasets.walmart_products + >>> metadata = await products.get_metadata() + >>> snapshot_id = await products( + ... filter={"name": "category", "operator": "=", "value": "Electronics"}, + ... records_limit=100 + ... ) + >>> data = await products.download(snapshot_id) + """ + + DATASET_ID = "gd_l95fol7l1ru6rlo116" + NAME = "walmart_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/webmotors/__init__.py b/src/brightdata/datasets/webmotors/__init__.py new file mode 100644 index 0000000..ad73f5d --- /dev/null +++ b/src/brightdata/datasets/webmotors/__init__.py @@ -0,0 +1,5 @@ +"""Webmotors datasets.""" + +from .vehicles import WebmotorsBrasil + +__all__ = ["WebmotorsBrasil"] diff --git a/src/brightdata/datasets/webmotors/vehicles.py b/src/brightdata/datasets/webmotors/vehicles.py new file mode 100644 index 0000000..443570c --- /dev/null +++ b/src/brightdata/datasets/webmotors/vehicles.py @@ -0,0 +1,25 @@ +""" +Webmotors Brasil dataset. + +Vehicle listings from Webmotors (Brazil). + +Use get_metadata() to discover all available fields dynamically. 
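+
+Example (illustrative; follows the same call/download flow documented on
+the other dataset classes):
+    >>> vehicles = client.datasets.webmotors_brasil
+    >>> metadata = await vehicles.get_metadata()
+    >>> snapshot_id = await vehicles(records_limit=100)
+    >>> data = await vehicles.download(snapshot_id)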
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class WebmotorsBrasil(BaseDataset): + """Webmotors Brasil vehicle listings dataset.""" + + DATASET_ID = "gd_ld73zt91j10sphddj" + NAME = "webmotors_brasil" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/world_zipcodes/__init__.py b/src/brightdata/datasets/world_zipcodes/__init__.py new file mode 100644 index 0000000..12e2dec --- /dev/null +++ b/src/brightdata/datasets/world_zipcodes/__init__.py @@ -0,0 +1,5 @@ +"""World Zipcodes datasets.""" + +from .zipcodes import WorldZipcodes + +__all__ = ["WorldZipcodes"] diff --git a/src/brightdata/datasets/world_zipcodes/zipcodes.py b/src/brightdata/datasets/world_zipcodes/zipcodes.py new file mode 100644 index 0000000..f406a26 --- /dev/null +++ b/src/brightdata/datasets/world_zipcodes/zipcodes.py @@ -0,0 +1,25 @@ +""" +World Zipcodes dataset. + +Global postal/zip code information. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class WorldZipcodes(BaseDataset): + """World Zipcodes dataset.""" + + DATASET_ID = "gd_licvqc95ta2552qxu" + NAME = "world_zipcodes" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/xing/__init__.py b/src/brightdata/datasets/xing/__init__.py new file mode 100644 index 0000000..4e42111 --- /dev/null +++ b/src/brightdata/datasets/xing/__init__.py @@ -0,0 +1,5 @@ +"""Xing datasets.""" + +from .profiles import XingProfiles + +__all__ = ["XingProfiles"] diff --git a/src/brightdata/datasets/xing/profiles.py b/src/brightdata/datasets/xing/profiles.py new file mode 100644 index 0000000..8469841 --- /dev/null +++ b/src/brightdata/datasets/xing/profiles.py @@ -0,0 +1,153 @@ +""" +Xing Social Network Profiles dataset. + +Professional profiles from Xing with experience, education, +skills, and contact information. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +# Field categories +PROFILE_FIELDS = [ + "account_id", + "familyName", + "givenName", + "name", + "gender", + "membership", + "country_code", + "honorificPrefix", + "jobTitle", + "image", + "url", + "addressLocality", +] + +EXPERIENCE_FIELDS = [ + "experience", + "education", + "languages", + "skills", +] + +SOCIAL_FIELDS = [ + "groups", + "interests", + "similar_profiles", + "wants", +] + + +class XingProfiles(BaseDataset): + """ + Xing Social Network Profiles dataset. + + Professional profiles with career history, education, + skills, and networking information. 
+ + Field Categories: + - Profile: Name, gender, membership, location, job title + - Experience: Work history, education, languages, skills + - Social: Groups, interests, similar profiles, wants + + Example: + >>> xing = client.datasets.xing_profiles + >>> # Discover available fields + >>> metadata = await xing.get_metadata() + >>> print(f"Total fields: {len(metadata.fields)}") + >>> + >>> # Filter by country + >>> snapshot_id = await xing( + ... filter={"name": "country_code", "operator": "=", "value": "DE"}, + ... records_limit=100 + ... ) + >>> data = await xing.download(snapshot_id) + """ + + DATASET_ID = "gd_l3lh4ev31oqrvvblv6" + NAME = "xing_profiles" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None + + @staticmethod + def get_profile_fields() -> List[str]: + """Get profile-related field names.""" + return PROFILE_FIELDS.copy() + + @staticmethod + def get_experience_fields() -> List[str]: + """Get experience and education field names.""" + return EXPERIENCE_FIELDS.copy() + + @staticmethod + def get_social_fields() -> List[str]: + """Get social networking field names.""" + return SOCIAL_FIELDS.copy() + + async def get_fields_by_category(self) -> Dict[str, List[str]]: + """Get all fields grouped by category.""" + if self._fields_by_category is not None: + return self._fields_by_category + + metadata = await self.get_metadata() + result: Dict[str, List[str]] = { + "profile": [], + "experience": [], + "social": [], + "other": [], + } + + profile_set = set(PROFILE_FIELDS) + experience_set = set(EXPERIENCE_FIELDS) + social_set = set(SOCIAL_FIELDS) + + for name, field_info in metadata.fields.items(): + if not field_info.active: + continue + + if name in profile_set: + result["profile"].append(name) + elif name in experience_set or "experience" in name or "education" in name: + result["experience"].append(name) + elif name in social_set or "group" in name or "interest" in name: + result["social"].append(name) + else: + result["other"].append(name) + + for category in result: + result[category] = sorted(result[category]) + + self._fields_by_category = result + return result + + async def search_fields(self, keyword: str) -> List[str]: + """Search for fields containing a keyword.""" + metadata = await self.get_metadata() + keyword_lower = keyword.lower() + + matches = [] + for name, field_info in metadata.fields.items(): + if keyword_lower in name.lower(): + matches.append(name) + elif field_info.description and keyword_lower in field_info.description.lower(): + matches.append(name) + + return sorted(matches) + + @staticmethod + def get_identifier_fields() -> List[str]: + """Get fields that can be used as unique identifiers.""" + return [ + "account_id", + "url", + ] diff --git a/src/brightdata/datasets/yapo/__init__.py b/src/brightdata/datasets/yapo/__init__.py new file mode 100644 index 0000000..729d864 --- /dev/null +++ b/src/brightdata/datasets/yapo/__init__.py @@ -0,0 +1,5 @@ +"""Yapo datasets.""" + +from .ads import YapoChile + +__all__ = ["YapoChile"] diff --git a/src/brightdata/datasets/yapo/ads.py b/src/brightdata/datasets/yapo/ads.py new file mode 100644 index 0000000..b19d8c2 --- /dev/null +++ b/src/brightdata/datasets/yapo/ads.py @@ -0,0 +1,25 @@ +""" +Yapo Chile dataset. + +Marketplace ads from Yapo Chile. + +Use get_metadata() to discover all available fields dynamically. 
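+
+Example (a minimal sketch; assumes the accessor client.datasets.yapo_chile
+derived from NAME below, and the sample()/download() flow used elsewhere):
+    >>> ads = client.datasets.yapo_chile
+    >>> snapshot_id = await ads.sample(records_limit=10)
+    >>> data = await ads.download(snapshot_id)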
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class YapoChile(BaseDataset): + """Yapo Chile marketplace ads dataset.""" + + DATASET_ID = "gd_lgfcz12mk6og7lvhs" + NAME = "yapo_chile" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/yelp/__init__.py b/src/brightdata/datasets/yelp/__init__.py new file mode 100644 index 0000000..23e6be6 --- /dev/null +++ b/src/brightdata/datasets/yelp/__init__.py @@ -0,0 +1,6 @@ +"""Yelp datasets.""" + +from .businesses import YelpBusinesses +from .reviews import YelpReviews + +__all__ = ["YelpBusinesses", "YelpReviews"] diff --git a/src/brightdata/datasets/yelp/businesses.py b/src/brightdata/datasets/yelp/businesses.py new file mode 100644 index 0000000..b7f26ba --- /dev/null +++ b/src/brightdata/datasets/yelp/businesses.py @@ -0,0 +1,187 @@ +""" +Yelp Businesses Overview dataset. + +Business listings from Yelp with ratings, location, and amenities. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +# Field categories +BUSINESS_FIELDS = [ + "business_id", + "yelp_biz_id", + "name", + "is_claimed", + "is_closed", +] + +LOCATION_FIELDS = [ + "address", + "full_address", + "city", + "state", + "country", + "zip_code", + "latitude", + "longitude", + "service_area", +] + +CONTACT_FIELDS = [ + "website", + "phone_number", + "opening_hours", + "url", +] + +RATING_FIELDS = [ + "overall_rating", + "reviews_count", + "price_range", +] + +CONTENT_FIELDS = [ + "categories", + "amenities", + "about_the_business", + "highlights", + "services_offered", + "updates_from_business", + "images_videos_urls", +] + + +class YelpBusinesses(BaseDataset): + """ + Yelp Businesses Overview dataset. + + Business listings with ratings, location data, amenities, + and contact information from Yelp. + + Field Categories: + - Business: ID, name, claimed status + - Location: Address, city, state, coordinates + - Contact: Website, phone, hours + - Ratings: Overall rating, review count, price range + - Content: Categories, amenities, services, photos + + Example: + >>> yelp = client.datasets.yelp_businesses + >>> # Discover available fields + >>> metadata = await yelp.get_metadata() + >>> print(f"Total fields: {len(metadata.fields)}") + >>> + >>> # Filter by rating + >>> snapshot_id = await yelp( + ... filter={"name": "overall_rating", "operator": ">=", "value": "4"}, + ... records_limit=100 + ... 
) + >>> data = await yelp.download(snapshot_id) + """ + + DATASET_ID = "gd_lgugwl0519h1p14rwk" + NAME = "yelp_businesses" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None + + @staticmethod + def get_business_fields() -> List[str]: + """Get business identity field names.""" + return BUSINESS_FIELDS.copy() + + @staticmethod + def get_location_fields() -> List[str]: + """Get location-related field names.""" + return LOCATION_FIELDS.copy() + + @staticmethod + def get_contact_fields() -> List[str]: + """Get contact information field names.""" + return CONTACT_FIELDS.copy() + + @staticmethod + def get_rating_fields() -> List[str]: + """Get rating-related field names.""" + return RATING_FIELDS.copy() + + @staticmethod + def get_content_fields() -> List[str]: + """Get content field names.""" + return CONTENT_FIELDS.copy() + + async def get_fields_by_category(self) -> Dict[str, List[str]]: + """Get all fields grouped by category.""" + if self._fields_by_category is not None: + return self._fields_by_category + + metadata = await self.get_metadata() + result: Dict[str, List[str]] = { + "business": [], + "location": [], + "contact": [], + "rating": [], + "content": [], + "other": [], + } + + business_set = set(BUSINESS_FIELDS) + location_set = set(LOCATION_FIELDS) + contact_set = set(CONTACT_FIELDS) + rating_set = set(RATING_FIELDS) + content_set = set(CONTENT_FIELDS) + + for name, field_info in metadata.fields.items(): + if not field_info.active: + continue + + if name in business_set: + result["business"].append(name) + elif name in location_set: + result["location"].append(name) + elif name in contact_set: + result["contact"].append(name) + elif name in rating_set: + result["rating"].append(name) + elif name in content_set: + result["content"].append(name) + else: + result["other"].append(name) + + for category in result: + result[category] = sorted(result[category]) + + self._fields_by_category = result + return result + + async def search_fields(self, keyword: str) -> List[str]: + """Search for fields containing a keyword.""" + metadata = await self.get_metadata() + keyword_lower = keyword.lower() + + matches = [] + for name, field_info in metadata.fields.items(): + if keyword_lower in name.lower(): + matches.append(name) + elif field_info.description and keyword_lower in field_info.description.lower(): + matches.append(name) + + return sorted(matches) + + @staticmethod + def get_identifier_fields() -> List[str]: + """Get fields that can be used as unique identifiers.""" + return [ + "business_id", + "yelp_biz_id", + "url", + ] diff --git a/src/brightdata/datasets/yelp/reviews.py b/src/brightdata/datasets/yelp/reviews.py new file mode 100644 index 0000000..3a9f04e --- /dev/null +++ b/src/brightdata/datasets/yelp/reviews.py @@ -0,0 +1,151 @@ +""" +Yelp Business Reviews dataset. + +Individual business reviews from Yelp with reviewer details, +ratings, and reactions. + +Use get_metadata() to discover all available fields dynamically. 
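+
+Example (sketch): search the live schema for reaction-related fields with
+the search_fields() helper defined below, before building a filter:
+    >>> reviews = client.datasets.yelp_reviews
+    >>> matching = await reviews.search_fields("reaction")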
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +# Field categories +BUSINESS_FIELDS = [ + "business_id", + "business_name", + "url", +] + +REVIEW_FIELDS = [ + "review_id", + "rating", + "date", + "date_iso_format", + "content", + "review_image", + "reactions", + "replies", + "review_order", + "recommended_review", +] + +REVIEWER_FIELDS = [ + "review_auther", # JSON with Friends, Image, Location, Photos, Reviews_made, URL + "profile_pic_url", + "elite_status", + "check-in_status", +] + + +class YelpReviews(BaseDataset): + """ + Yelp Business Reviews dataset. + + Individual reviews for businesses with reviewer information, + ratings, reactions, and reply data. + + Field Categories: + - Business: ID, name, URL + - Review: Content, rating, date, images, reactions, replies + - Reviewer: Author info, profile pic, elite status, check-ins + + Example: + >>> reviews = client.datasets.yelp_reviews + >>> # Discover available fields + >>> metadata = await reviews.get_metadata() + >>> print(f"Total fields: {len(metadata.fields)}") + >>> + >>> # Filter by rating + >>> snapshot_id = await reviews( + ... filter={"name": "rating", "operator": ">=", "value": "4"}, + ... records_limit=100 + ... ) + >>> data = await reviews.download(snapshot_id) + """ + + DATASET_ID = "gd_lgzhlu9323u3k24jkv" + NAME = "yelp_reviews" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None + + @staticmethod + def get_business_fields() -> List[str]: + """Get business-related field names.""" + return BUSINESS_FIELDS.copy() + + @staticmethod + def get_review_fields() -> List[str]: + """Get review content field names.""" + return REVIEW_FIELDS.copy() + + @staticmethod + def get_reviewer_fields() -> List[str]: + """Get reviewer-related field names.""" + return REVIEWER_FIELDS.copy() + + async def get_fields_by_category(self) -> Dict[str, List[str]]: + """Get all fields grouped by category.""" + if self._fields_by_category is not None: + return self._fields_by_category + + metadata = await self.get_metadata() + result: Dict[str, List[str]] = { + "business": [], + "review": [], + "reviewer": [], + "other": [], + } + + business_set = set(BUSINESS_FIELDS) + review_set = set(REVIEW_FIELDS) + reviewer_set = set(REVIEWER_FIELDS) + + for name, field_info in metadata.fields.items(): + if not field_info.active: + continue + + if name in business_set or name.startswith("business"): + result["business"].append(name) + elif name in review_set or name.startswith("review"): + result["review"].append(name) + elif name in reviewer_set or name.startswith("reviewer") or "author" in name.lower(): + result["reviewer"].append(name) + else: + result["other"].append(name) + + for category in result: + result[category] = sorted(result[category]) + + self._fields_by_category = result + return result + + async def search_fields(self, keyword: str) -> List[str]: + """Search for fields containing a keyword.""" + metadata = await self.get_metadata() + keyword_lower = keyword.lower() + + matches = [] + for name, field_info in metadata.fields.items(): + if keyword_lower in name.lower(): + matches.append(name) + elif field_info.description and keyword_lower in field_info.description.lower(): + matches.append(name) + + return sorted(matches) + + @staticmethod + def get_identifier_fields() -> List[str]: + """Get fields that can be used as unique identifiers.""" + 
return [ + "review_id", + "business_id", + "url", + ] diff --git a/src/brightdata/datasets/youtube/__init__.py b/src/brightdata/datasets/youtube/__init__.py new file mode 100644 index 0000000..29d8144 --- /dev/null +++ b/src/brightdata/datasets/youtube/__init__.py @@ -0,0 +1,7 @@ +"""YouTube datasets.""" + +from .profiles import YouTubeProfiles +from .videos import YouTubeVideos +from .comments import YouTubeComments + +__all__ = ["YouTubeProfiles", "YouTubeVideos", "YouTubeComments"] diff --git a/src/brightdata/datasets/youtube/comments.py b/src/brightdata/datasets/youtube/comments.py new file mode 100644 index 0000000..81867aa --- /dev/null +++ b/src/brightdata/datasets/youtube/comments.py @@ -0,0 +1,25 @@ +""" +YouTube Comments dataset. + +Comments from YouTube videos. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class YouTubeComments(BaseDataset): + """YouTube Comments dataset.""" + + DATASET_ID = "gd_lk9q0ew71spt1mxywf" + NAME = "youtube_comments" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/youtube/profiles.py b/src/brightdata/datasets/youtube/profiles.py new file mode 100644 index 0000000..35379f8 --- /dev/null +++ b/src/brightdata/datasets/youtube/profiles.py @@ -0,0 +1,25 @@ +""" +YouTube Profiles dataset. + +Channel profiles from YouTube. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class YouTubeProfiles(BaseDataset): + """YouTube Profiles dataset.""" + + DATASET_ID = "gd_lk538t2k2p1k3oos71" + NAME = "youtube_profiles" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/youtube/videos.py b/src/brightdata/datasets/youtube/videos.py new file mode 100644 index 0000000..12de2f9 --- /dev/null +++ b/src/brightdata/datasets/youtube/videos.py @@ -0,0 +1,25 @@ +""" +YouTube Videos dataset. + +Video posts from YouTube. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class YouTubeVideos(BaseDataset): + """YouTube Videos dataset.""" + + DATASET_ID = "gd_lk56epmy2i5g7lzu0k" + NAME = "youtube_videos" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/ysl/__init__.py b/src/brightdata/datasets/ysl/__init__.py new file mode 100644 index 0000000..b961d2e --- /dev/null +++ b/src/brightdata/datasets/ysl/__init__.py @@ -0,0 +1,5 @@ +"""YSL datasets.""" + +from .products import YSLProducts + +__all__ = ["YSLProducts"] diff --git a/src/brightdata/datasets/ysl/products.py b/src/brightdata/datasets/ysl/products.py new file mode 100644 index 0000000..3ca73e6 --- /dev/null +++ b/src/brightdata/datasets/ysl/products.py @@ -0,0 +1,25 @@ +""" +YSL Products dataset. + +Luxury product listings from YSL (Yves Saint Laurent). 
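+
+Example (a minimal sketch; assumes the accessor client.datasets.ysl_products
+derived from NAME below, and the sample()/download() flow used elsewhere):
+    >>> products = client.datasets.ysl_products
+    >>> snapshot_id = await products.sample(records_limit=10)
+    >>> data = await products.download(snapshot_id)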
+ +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class YSLProducts(BaseDataset): + """YSL Products dataset.""" + + DATASET_ID = "gd_lhai2io04wilkad5z" + NAME = "ysl_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/zalando/__init__.py b/src/brightdata/datasets/zalando/__init__.py new file mode 100644 index 0000000..7f8333c --- /dev/null +++ b/src/brightdata/datasets/zalando/__init__.py @@ -0,0 +1,5 @@ +"""Zalando datasets.""" + +from .products import ZalandoProducts + +__all__ = ["ZalandoProducts"] diff --git a/src/brightdata/datasets/zalando/products.py b/src/brightdata/datasets/zalando/products.py new file mode 100644 index 0000000..8708d4e --- /dev/null +++ b/src/brightdata/datasets/zalando/products.py @@ -0,0 +1,25 @@ +""" +Zalando Products dataset. + +Fashion product listings from Zalando. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class ZalandoProducts(BaseDataset): + """Zalando Products dataset.""" + + DATASET_ID = "gd_lbqj6l5s28ofha6mlk" + NAME = "zalando_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/zara/__init__.py b/src/brightdata/datasets/zara/__init__.py new file mode 100644 index 0000000..504dd15 --- /dev/null +++ b/src/brightdata/datasets/zara/__init__.py @@ -0,0 +1,6 @@ +"""Zara datasets.""" + +from .products import ZaraProducts +from .home_products import ZaraHomeProducts + +__all__ = ["ZaraProducts", "ZaraHomeProducts"] diff --git a/src/brightdata/datasets/zara/home_products.py b/src/brightdata/datasets/zara/home_products.py new file mode 100644 index 0000000..32f340c --- /dev/null +++ b/src/brightdata/datasets/zara/home_products.py @@ -0,0 +1,25 @@ +""" +Zara Home Products dataset. + +Home decor product listings from Zara Home. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class ZaraHomeProducts(BaseDataset): + """Zara Home Products dataset.""" + + DATASET_ID = "gd_lcx5utgek9mxrsiie" + NAME = "zara_home_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/zara/products.py b/src/brightdata/datasets/zara/products.py new file mode 100644 index 0000000..5cd494c --- /dev/null +++ b/src/brightdata/datasets/zara/products.py @@ -0,0 +1,25 @@ +""" +Zara Products dataset. + +Fashion product listings from Zara. + +Use get_metadata() to discover all available fields dynamically. 
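+
+Example (a minimal sketch; assumes the accessor client.datasets.zara_products
+derived from NAME below, and the sample()/download() flow used elsewhere):
+    >>> products = client.datasets.zara_products
+    >>> snapshot_id = await products.sample(records_limit=10)
+    >>> data = await products.download(snapshot_id)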
+""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class ZaraProducts(BaseDataset): + """Zara Products dataset.""" + + DATASET_ID = "gd_lct4vafw1tgx27d4o0" + NAME = "zara_products" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/zillow/__init__.py b/src/brightdata/datasets/zillow/__init__.py new file mode 100644 index 0000000..f99a42e --- /dev/null +++ b/src/brightdata/datasets/zillow/__init__.py @@ -0,0 +1,5 @@ +"""Zillow datasets.""" + +from .properties import ZillowProperties + +__all__ = ["ZillowProperties"] diff --git a/src/brightdata/datasets/zillow/properties.py b/src/brightdata/datasets/zillow/properties.py new file mode 100644 index 0000000..72c594b --- /dev/null +++ b/src/brightdata/datasets/zillow/properties.py @@ -0,0 +1,25 @@ +""" +Zillow Properties dataset. + +Real estate property listings from Zillow. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class ZillowProperties(BaseDataset): + """Zillow Properties dataset.""" + + DATASET_ID = "gd_lfqkr8wm13ixtbd8f5" + NAME = "zillow_properties" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/zonaprop/__init__.py b/src/brightdata/datasets/zonaprop/__init__.py new file mode 100644 index 0000000..404735d --- /dev/null +++ b/src/brightdata/datasets/zonaprop/__init__.py @@ -0,0 +1,5 @@ +"""Zonaprop datasets.""" + +from .properties import ZonapropArgentina + +__all__ = ["ZonapropArgentina"] diff --git a/src/brightdata/datasets/zonaprop/properties.py b/src/brightdata/datasets/zonaprop/properties.py new file mode 100644 index 0000000..ba29f54 --- /dev/null +++ b/src/brightdata/datasets/zonaprop/properties.py @@ -0,0 +1,25 @@ +""" +Zonaprop Argentina dataset. + +Real estate property listings from Zonaprop Argentina. + +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +class ZonapropArgentina(BaseDataset): + """Zonaprop Argentina real estate dataset.""" + + DATASET_ID = "gd_lfsbhfgo2bglgrecm6" + NAME = "zonaprop_argentina" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None diff --git a/src/brightdata/datasets/zoominfo/__init__.py b/src/brightdata/datasets/zoominfo/__init__.py new file mode 100644 index 0000000..8a3d4f0 --- /dev/null +++ b/src/brightdata/datasets/zoominfo/__init__.py @@ -0,0 +1,5 @@ +"""ZoomInfo datasets.""" + +from .companies import ZoomInfoCompanies + +__all__ = ["ZoomInfoCompanies"] diff --git a/src/brightdata/datasets/zoominfo/companies.py b/src/brightdata/datasets/zoominfo/companies.py new file mode 100644 index 0000000..25fe578 --- /dev/null +++ b/src/brightdata/datasets/zoominfo/companies.py @@ -0,0 +1,202 @@ +""" +ZoomInfo Companies Information dataset. + +Company and contact data from ZoomInfo including financials, +employee counts, tech stack, and org charts. 
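+
+Example (sketch): group the live field list by category with the
+get_fields_by_category() helper defined below:
+    >>> zoominfo = client.datasets.zoominfo_companies
+    >>> by_category = await zoominfo.get_fields_by_category()
+    >>> by_category["financial"]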
+ +Use get_metadata() to discover all available fields dynamically. +""" + +from typing import TYPE_CHECKING, Dict, List, Optional + +from ..base import BaseDataset + +if TYPE_CHECKING: + from ...core.async_engine import AsyncEngine + + +# Field categories +COMPANY_FIELDS = [ + "url", + "id", + "name", + "description", + "website", + "industry", + "headquarters", + "phone_number", +] + +FINANCIAL_FIELDS = [ + "revenue", + "revenue_currency", + "revenue_text", + "stock_symbol", + "total_funding_amount", + "most_recent_funding_amount", + "funding_currency", + "funding_rounds", +] + +EMPLOYEE_FIELDS = [ + "employees", + "employees_text", + "total_employees", + "c_level_employees", + "vp_level_employees", + "director_level_employees", + "manager_level_employees", + "non_manager_employees", +] + +LEADERSHIP_FIELDS = [ + "leadership", + "ceo", + "top_contacts", + "org_chart", + "ceo_rating", +] + +TECH_FIELDS = [ + "products_owned", + "tech_stack", +] + +OTHER_FIELDS = [ + "popular_searches", + "business_classification_codes", + "social_media", + "enps score", + "similar_companies", + "email_formats", + "recent_scoops", + "news_and_media", +] + + +class ZoomInfoCompanies(BaseDataset): + """ + ZoomInfo Companies Information dataset. + + Comprehensive company data including financials, employee counts, + leadership contacts, tech stack, and organizational structure. + + Field Categories: + - Company: Basic info, website, industry + - Financial: Revenue, funding, stock symbol + - Employees: Counts by level (C-level, VP, Director, etc.) + - Leadership: CEO, contacts, org chart + - Tech: Tech stack, products owned + + Example: + >>> zoominfo = client.datasets.zoominfo_companies + >>> # Discover available fields + >>> metadata = await zoominfo.get_metadata() + >>> print(f"Total fields: {len(metadata.fields)}") + >>> + >>> # Filter by industry + >>> snapshot_id = await zoominfo( + ... filter={"name": "industry", "operator": "=", "value": "Technology"}, + ... records_limit=100 + ... 
) + >>> data = await zoominfo.download(snapshot_id) + """ + + DATASET_ID = "gd_m0ci4a4ivx3j5l6nx" + NAME = "zoominfo_companies" + + def __init__(self, engine: "AsyncEngine"): + super().__init__(engine) + self._fields_by_category: Optional[Dict[str, List[str]]] = None + + @staticmethod + def get_company_fields() -> List[str]: + """Get company identity field names.""" + return COMPANY_FIELDS.copy() + + @staticmethod + def get_financial_fields() -> List[str]: + """Get financial-related field names.""" + return FINANCIAL_FIELDS.copy() + + @staticmethod + def get_employee_fields() -> List[str]: + """Get employee count field names.""" + return EMPLOYEE_FIELDS.copy() + + @staticmethod + def get_leadership_fields() -> List[str]: + """Get leadership and contact field names.""" + return LEADERSHIP_FIELDS.copy() + + @staticmethod + def get_tech_fields() -> List[str]: + """Get technology-related field names.""" + return TECH_FIELDS.copy() + + async def get_fields_by_category(self) -> Dict[str, List[str]]: + """Get all fields grouped by category.""" + if self._fields_by_category is not None: + return self._fields_by_category + + metadata = await self.get_metadata() + result: Dict[str, List[str]] = { + "company": [], + "financial": [], + "employees": [], + "leadership": [], + "tech": [], + "other": [], + } + + company_set = set(COMPANY_FIELDS) + financial_set = set(FINANCIAL_FIELDS) + employee_set = set(EMPLOYEE_FIELDS) + leadership_set = set(LEADERSHIP_FIELDS) + tech_set = set(TECH_FIELDS) + + for name, field_info in metadata.fields.items(): + if not field_info.active: + continue + + if name in company_set: + result["company"].append(name) + elif name in financial_set or "funding" in name or "revenue" in name: + result["financial"].append(name) + elif name in employee_set or "employee" in name: + result["employees"].append(name) + elif name in leadership_set: + result["leadership"].append(name) + elif name in tech_set: + result["tech"].append(name) + else: + result["other"].append(name) + + for category in result: + result[category] = sorted(result[category]) + + self._fields_by_category = result + return result + + async def search_fields(self, keyword: str) -> List[str]: + """Search for fields containing a keyword.""" + metadata = await self.get_metadata() + keyword_lower = keyword.lower() + + matches = [] + for name, field_info in metadata.fields.items(): + if keyword_lower in name.lower(): + matches.append(name) + elif field_info.description and keyword_lower in field_info.description.lower(): + matches.append(name) + + return sorted(matches) + + @staticmethod + def get_identifier_fields() -> List[str]: + """Get fields that can be used as unique identifiers.""" + return [ + "id", + "url", + "website", + ] From d5ad7d04daf14d18921e297f504adef6124aa4a5 Mon Sep 17 00:00:00 2001 From: "user.mail" Date: Mon, 16 Feb 2026 12:00:05 +0300 Subject: [PATCH 4/5] add notebook and fix version --- CHANGELOG.md | 28 + notebooks/datasets/mass_test.ipynb | 2567 +++++++++++++++++ pyproject.toml | 2 +- src/brightdata/datasets/base.py | 28 + .../datasets/companies_enriched/companies.py | 3 +- .../datasets/employees_enriched/employees.py | 3 +- 6 files changed, 2626 insertions(+), 5 deletions(-) create mode 100644 notebooks/datasets/mass_test.ipynb diff --git a/CHANGELOG.md b/CHANGELOG.md index 662b366..5bf1e79 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,33 @@ # Bright Data Python SDK Changelog +## Version 2.2.1 - 100 Datasets API + +### ✨ New Features + +#### Expanded Datasets Coverage +Added 92 new dataset 
integrations, bringing the total to **100 datasets**: + +- **Luxury Brands**: Loewe, Berluti, Moynat, Hermes, Delvaux, Prada, Montblanc, YSL, Dior, Balenciaga, Bottega Veneta, Celine, Chanel, Fendi +- **E-commerce**: Amazon (Reviews, Sellers), Walmart, Shopee, Lazada, Zalando, Sephora, Zara, Mango, Massimo Dutti, Asos, Shein, Ikea, H&M, Lego, Mouser, Digikey +- **Social Media**: Instagram (Profiles, Posts), TikTok, Pinterest (Posts, Profiles), YouTube (Profiles, Videos, Comments), Facebook Pages Posts +- **Real Estate**: Zillow, Airbnb, Australia Real Estate, Otodom Poland, Zonaprop Argentina, Metrocuadrado, Infocasas Uruguay, Properati, Toctoc, Inmuebles24 Mexico, Yapo Chile +- **Business Data**: Glassdoor (Companies, Reviews, Jobs), Indeed (Companies, Jobs), ZoomInfo, PitchBook, G2, Trustpilot, TrustRadius, Owler, Slintel, Manta, VentureRadar, Companies Enriched, Employees Enriched +- **Other**: World Zipcodes, US Lawyers, Google Maps Reviews, Yelp, Xing Profiles, OLX Brazil, Webmotors Brasil, Chileautos, LinkedIn Jobs + +#### SERP Pagination Support +Added sequential querying to retrieve more than 10 search results from Google: + +```python +async with BrightDataClient() as client: + # Get up to 50 results with automatic pagination + results = await client.search.google( + query="python programming", + num_results=50 # Fetches multiple pages sequentially + ) +``` + +--- + ## Version 2.2.0 - Datasets API ### ✨ New Features diff --git a/notebooks/datasets/mass_test.ipynb b/notebooks/datasets/mass_test.ipynb new file mode 100644 index 0000000..a37f8be --- /dev/null +++ b/notebooks/datasets/mass_test.ipynb @@ -0,0 +1,2567 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Datasets Mass Test\n", + "\n", + "Tests all 87 datasets (excluding Amazon, LinkedIn, Crunchbase, Instagram, TikTok, YouTube).\n", + "Each dataset has 2 cells:\n", + "1. **Trigger** - Creates a snapshot using `sample()` and returns snapshot_id\n", + "2. **Download** - Fetches data when snapshot is ready\n", + "\n", + "Each dataset uses unique variable names so cells can be run in any order." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 1, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from brightdata import BrightDataClient\n", + "\n", + "client = BrightDataClient()\n", + "await client.__aenter__()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Entertainment & Media" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "IMDB Movies snapshot: snap_mlowtcf5t08eg7smu\n" + ] + } + ], + "source": [ + "# IMDB Movies - Trigger\n", + "imdb_movies_snapshot = await client.datasets.imdb_movies.sample(records_limit=2)\n", + "print(f\"IMDB Movies snapshot: {imdb_movies_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "IMDB Movies: 2 records\n" + ] + }, + { + "data": { + "text/plain": [ + "[{'awards': None,\n", + " 'boxoffice_budget': None,\n", + " 'comment': None,\n", + " 'credit': None,\n", + " 'critics_review_count': 0,\n", + " 'details_also_known_as': None,\n", + " 'details_countries_of_origin': None,\n", + " 'details_filming_locations': None,\n", + " 'details_language': None,\n", + " 'details_official_site': None,\n", + " 'details_production_companies': None,\n", + " 'details_release_date': '9/18/2011 (Taiwan)',\n", + " 'featured_review': None,\n", + " 'genres': ['Drama', 'Romance'],\n", + " 'id': 'tt35681192',\n", + " 'imdb_rating': None,\n", + " 'imdb_rating_count': 0,\n", + " 'media_type': 'Feature Film',\n", + " 'photos': None,\n", + " 'popularity': None,\n", + " 'poster_url': 'https://www.imdb.com/title/tt35681192/mediaviewer/rm2686300162',\n", + " 'presentation': None,\n", + " 'review_count': 0,\n", + " 'review_rating': None,\n", + " 'specs_aspect_ratio': None,\n", + " 'specs_color': None,\n", + " 'specs_sound_mix': None,\n", + " 'storyline': None,\n", + " 'title': '#1.16',\n", + " 'top_cast': None,\n", + " 'url': 'https://www.imdb.com/title/tt35681192/',\n", + " 'videos': None},\n", + " {'awards': 'Awards, 1 nomination total',\n", + " 'boxoffice_budget': '$180,000.00',\n", + " 'comment': None,\n", + " 'credit': [{'names': [{'link': 'https://www.imdb.com/name/nm0001129/',\n", + " 'name': 'Jonathan Demme'}],\n", + " 'title': 'Director'},\n", + " {'names': [{'link': 'https://www.imdb.com/name/nm0001129/',\n", + " 'name': 'Jonathan Demme'}],\n", + " 'title': 'Writer'},\n", + " {'names': [{'link': 'https://www.imdb.com/name/nm0113921/',\n", + " 'name': 'Juanita Brown'},\n", + " {'link': 'https://www.imdb.com/name/nm0310532/', 'name': 'Erica Gavin'},\n", + " {'link': 'https://www.imdb.com/name/nm0172646/',\n", + " 'name': 'Roberta Collins'}],\n", + " 'title': 'Stars'}],\n", + " 'critics_review_count': 54,\n", + " 'details_also_known_as': 'Jaula caliente',\n", + " 'details_countries_of_origin': 'United States',\n", + " 'details_filming_locations': 'Lincoln Heights Jail - 401 N. 
Avenue 19, Lincoln Heights, Los Angeles, California, USA',\n", + " 'details_language': 'English',\n", + " 'details_official_site': None,\n", + " 'details_production_companies': 'Renegade Women Company, Artists Entertainment Complex',\n", + " 'details_release_date': '6/14/1975 (Japan)',\n", + " 'featured_review': 'Arguably the finest women in prison (WIP) film ever made, CAGED HEAT proves that even a trash exploitation film can aspire to decent artistic values. Jackie (Erica Gavin), an accomplice in a drug related crime, is sent to a southern penitentiary run by an oppressive, wheelchair-bound warden (Barbara Steele). Jackie's cell mate Lavelle (Cheryl Rainbeuax Smith) suffers from suicidal nightmares while another prisoner, Pandora (Ella Reid), is reprimanded for entertaining her fellow inmates with a mildly lewd vaudeville act and placed in solitary confinement. Her loyal friend Belle (Roberta Collins) begins sneaking through the ventilation ducts to bring her food from the kitchen until she's caught when she surprises an elderly staff member who abruptly dies of a heart attack. Meanwhile, the prison bully Maggie (Juanita Brown) picks a fight with Jackie and gets them both in hot water. Though the warden is a bit stern, the real threat turns out to be the demented prison doctor (Warren Miller). He subjects Jackie and Maggie to illegal electric shock therapy and prescribes a more permanent `cure' for Belle: corrective brain surgery, which he intends to perform with a Black and Decker power drill (!). Jackie and Maggie finally work out their differences and manage to escape in a highjacked prison truck. But Jackie can't bring herself to abandon Lavelle, Pandora, and especially the doomed Belle. With Maggie's help, she plans a daring prison break to rescue her friends.Jonathan Demme's script provides believable characters and several imaginative dream sequences, and his direction is filled with impressive camera angles and novel wipes and dissolves. He even commissioned an appropriately down and dirty soundtrack from blues legend John Cale. Because of these frequent artistic flourishes, CAGED HEAT is one of the few WIP movies to win the respect of critics. In spite of the abundant exploitation and nudity, the film unexpectedly also won the approval of some feminist groups who praised its positive depiction of `Woman Power.'\\nA hugely appealing cast helps the movie immeasurably. Ms. Steele earned a reputation as the original `Scream Queen' with her edgy performances in horror classics like Mario Bava's BLACK SUNDAY and Roger Corman's THE PIT AND THE PENDULUM (both 1961). She's cast largely against type here as the prudish warden, but a dream sequence in which she performs a raucous Vegas style dance number wearing glittering tights and sheer stockings reveals her character's repressed eroticism, a quality Steele projected in all her roles. Leading lady Ms. Gavin made her screen debut several years earlier in one of the first hardcore adult features, Russ Meyer's VIXEN! (1968), which was a gutsy career move in an era when many actors were arrested for performing sex acts on film, then still a punishable crime. The petite Ms. Smith enjoyed a busy career in exploitation films during the '70s and early '80s; she tragically died of hepatitis in 2002. But beautiful blue-eyed Ms. Collins, who had already appeared in two previous WIP movies (THE BIG DOLL HOUSE and WOMEN IN CAGES, both made in 1971), steals the show as the endearingly faithful Belle. 
The character takes considerable personal risk to help her friend Pandora and ultimately suffers for her effort. When we see her molested by the perverted doctor and learn that she's scheduled to become his next lobotomy victim, the news is genuinely shocking and upsetting, which nicely sets up Jackie and Maggie's race against the clock to save her. In other words, Belle ultimately becomes the emotional focus of the entire plot, and Ms. Collins handles the pivotal role with winning charisma and grace. She went on to appear in countless more cult B movies, including a fourth WIP film, VENDETTA (1986).Demme of course went on to even bigger and better things, becoming one of the most successful directors of his generation. He won a Best Director Academy Award in 1991 for THE SILENCE OF THE LAMBS, which also won the Best Picture Oscar.',\n", + " 'genres': ['Action', 'Comedy', 'Drama'],\n", + " 'id': 'tt0071266',\n", + " 'imdb_rating': 5.3,\n", + " 'imdb_rating_count': 3423,\n", + " 'media_type': 'Feature Film',\n", + " 'photos': [{'link': 'https://www.imdb.com/title/tt0071266/mediaviewer/rm3823400449',\n", + " 'name': 'Erica Gavin in Caged Heat (1974)'},\n", + " {'link': 'https://www.imdb.com/title/tt0071266/mediaviewer/rm309268737',\n", + " 'name': 'Caged Heat (1974)'},\n", + " {'link': 'https://www.imdb.com/title/tt0071266/mediaviewer/rm342823169',\n", + " 'name': 'Barbara Steele in Caged Heat (1974)'},\n", + " {'link': 'https://www.imdb.com/title/tt0071266/mediaviewer/rm359600385',\n", + " 'name': 'Caged Heat (1974)'},\n", + " {'link': 'https://www.imdb.com/title/tt0071266/mediaviewer/rm175051009',\n", + " 'name': 'Caged Heat (1974)'},\n", + " {'link': 'https://www.imdb.com/title/tt0071266/mediaviewer/rm74387713',\n", + " 'name': 'Caged Heat (1974)'},\n", + " {'link': 'https://www.imdb.com/title/tt0071266/mediaviewer/rm1047466241',\n", + " 'name': 'Caged Heat (1974)'},\n", + " {'link': 'https://www.imdb.com/title/tt0071266/mediaviewer/rm812585217',\n", + " 'name': 'Caged Heat (1974)'},\n", + " {'link': 'https://www.imdb.com/title/tt0071266/mediaviewer/rm879694081',\n", + " 'name': 'Caged Heat (1974)'}],\n", + " 'popularity': None,\n", + " 'poster_url': 'https://www.imdb.com/title/tt0071266/mediaviewer/rm4077074944',\n", + " 'presentation': \"In a women's prison, a group of inmates band together to combat the repressive and abusive policies of the crippled female warden and the corrupt prison doctor.\",\n", + " 'review_count': 45,\n", + " 'review_rating': None,\n", + " 'specs_aspect_ratio': '1.85 : 1',\n", + " 'specs_color': 'Color',\n", + " 'specs_sound_mix': 'Mono',\n", + " 'storyline': 'A girl is caught in a drug bust and sent to the hoosegow. The iron-handed female superintendent takes exception to a skit performed by the girls and takes punitive steps, aided by the sadistic doctor who is doing illegal electroshock experiments and raping drugged prisoners. After a while, the prisoners put away their petty differences and plan the Big Prison Escape. 
—Ed Sutton ',\n", + " 'title': 'Caged Heat',\n", + " 'top_cast': [{'actor': 'Juanita Brown',\n", + " 'character': 'Maggie',\n", + " 'link': 'https://www.imdb.com/name/nm0113921'},\n", + " {'actor': 'Erica Gavin',\n", + " 'character': 'Jacqueline Wilson',\n", + " 'link': 'https://www.imdb.com/name/nm0310532'},\n", + " {'actor': 'Roberta Collins',\n", + " 'character': 'Belle Tyson',\n", + " 'link': 'https://www.imdb.com/name/nm0172646'},\n", + " {'actor': 'Ella Reid',\n", + " 'character': 'Pandora',\n", + " 'link': 'https://www.imdb.com/name/nm0717254'},\n", + " {'actor': 'Cheryl Smith',\n", + " 'character': 'Lavelle',\n", + " 'link': 'https://www.imdb.com/name/nm0807679'},\n", + " {'actor': 'Barbara Steele',\n", + " 'character': 'Superintendent McQueen',\n", + " 'link': 'https://www.imdb.com/name/nm0824489'},\n", + " {'actor': 'Warren Miller',\n", + " 'character': 'Dr. Randolph',\n", + " 'link': 'https://www.imdb.com/name/nm0589497'},\n", + " {'actor': 'Crystin Sinclaire',\n", + " 'character': 'Crazy Alice',\n", + " 'link': 'https://www.imdb.com/name/nm0325030'},\n", + " {'actor': 'Toby Carr Rafelson',\n", + " 'character': 'Pinter',\n", + " 'link': 'https://www.imdb.com/name/nm0706186'},\n", + " {'actor': 'Mickey Fox',\n", + " 'character': 'Bernice',\n", + " 'link': 'https://www.imdb.com/name/nm0289157'},\n", + " {'actor': 'Dorothy Love',\n", + " 'character': 'Kitchen Matron',\n", + " 'link': 'https://www.imdb.com/name/nm0522314'},\n", + " {'actor': 'Irene Stokes',\n", + " 'character': 'Hazel',\n", + " 'link': 'https://www.imdb.com/name/nm0831347'},\n", + " {'actor': 'Cynthia Songé',\n", + " 'character': 'Rosemary',\n", + " 'link': 'https://www.imdb.com/name/nm0814329'},\n", + " {'actor': 'Carol Terry',\n", + " 'character': 'Kitchen Guard',\n", + " 'link': 'https://www.imdb.com/name/nm0855957'},\n", + " {'actor': 'Layla Bias Galloway',\n", + " 'character': 'Shower Guard',\n", + " 'link': 'https://www.imdb.com/name/nm0303190'},\n", + " {'actor': 'Ann Stockdale',\n", + " 'character': 'Bonnie',\n", + " 'link': 'https://www.imdb.com/name/nm0830768'},\n", + " {'actor': 'Essie Hayes',\n", + " 'character': 'Essie',\n", + " 'link': 'https://www.imdb.com/name/nm0371009'},\n", + " {'actor': 'John Aprea',\n", + " 'character': 'Dream Man',\n", + " 'link': 'https://www.imdb.com/name/nm0032501'}],\n", + " 'url': 'https://www.imdb.com/title/tt0071266/',\n", + " 'videos': None}]" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# IMDB Movies - Download\n", + "imdb_movies_data = await client.datasets.imdb_movies.download(imdb_movies_snapshot)\n", + "print(f\"IMDB Movies: {len(imdb_movies_data)} records\")\n", + "imdb_movies_data" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NBA Players Stats snapshot: snap_mlowtqzptfxvaj2e5\n" + ] + } + ], + "source": [ + "# NBA Players Stats - Trigger\n", + "nba_players_snapshot = await client.datasets.nba_players_stats.sample(records_limit=2)\n", + "print(f\"NBA Players Stats snapshot: {nba_players_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "NBA Players Stats: 2 records\n" + ] + }, + { + "data": { + "text/plain": [ + "[{'player_assist_to_turnover_ratio': 1.8,\n", + " 'player_assists_per_game': 2.3,\n", + " 'player_blocks_per_game': 0.3,\n", + " 
'player_defensive_rebounds_per_game': 1.9,\n", + " 'player_fouls_per_game': 2.3,\n", + " 'player_games_played': 44,\n", + " 'player_games_started': 44,\n", + " 'player_minutes_per_game': 29.8,\n", + " 'player_name': 'Wesley Matthews',\n", + " 'player_offensive_rebounds__per_game': 0.5,\n", + " 'player_points_per_game': 13.1,\n", + " 'player_rebounds_per_game': 2.3,\n", + " 'player_steals_per_game': 0.8,\n", + " 'player_turnovers_per_game': 1.3,\n", + " 'season_type': 'Regular season',\n", + " 'season_year': '2018-19',\n", + " 'team': 'DAL',\n", + " 'url': 'https://www.espn.com/nba/player/_/id/4032/wesley-matthews'},\n", + " {'player_assist_to_turnover_ratio': 1,\n", + " 'player_assists_per_game': 0.3,\n", + " 'player_blocks_per_game': 0.1,\n", + " 'player_defensive_rebounds_per_game': 0.5,\n", + " 'player_fouls_per_game': 0.6,\n", + " 'player_games_played': 35,\n", + " 'player_games_started': 1,\n", + " 'player_minutes_per_game': 7.1,\n", + " 'player_name': 'Jake Layman',\n", + " 'player_offensive_rebounds__per_game': 0.2,\n", + " 'player_points_per_game': 2.2,\n", + " 'player_rebounds_per_game': 0.7,\n", + " 'player_steals_per_game': 0.2,\n", + " 'player_turnovers_per_game': 0.3,\n", + " 'season_type': 'Not clear',\n", + " 'season_year': '2016-17',\n", + " 'team': 'POR',\n", + " 'url': 'https://www.espn.com/nba/player/_/id/2982268/jake-layman?year=2020-21&team=MIN'}]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# NBA Players Stats - Download\n", + "nba_players_data = await client.datasets.nba_players_stats.download(nba_players_snapshot)\n", + "print(f\"NBA Players Stats: {len(nba_players_data)} records\")\n", + "nba_players_data" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Goodreads Books snapshot: snap_mlowu0lx1wf6w62rfb\n" + ] + } + ], + "source": [ + "# Goodreads Books - Trigger\n", + "goodreads_snapshot = await client.datasets.goodreads_books.sample(records_limit=2)\n", + "print(f\"Goodreads Books snapshot: {goodreads_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Goodreads Books: 2 records\n" + ] + }, + { + "data": { + "text/plain": [ + "[{'about_author': {'name': 'Peter Towers', 'num_books': 6},\n", + " 'author': ['Peter Towers'],\n", + " 'community_reviews': {'1_stars': {'reviews_num': 0, 'reviews_percentage': 0},\n", + " '2_stars': {'reviews_num': 1, 'reviews_percentage': 100},\n", + " '3_stars': {'reviews_num': 0, 'reviews_percentage': 0},\n", + " '4_stars': {'reviews_num': 0, 'reviews_percentage': 0},\n", + " '5_stars': {'reviews_num': 0, 'reviews_percentage': 0}},\n", + " 'first_published': '12/9/2015',\n", + " 'genres': None,\n", + " 'id': '28945894-build-value-with-virtual-chief-financial-officer-services',\n", + " 'isbn': None,\n", + " 'kindle_price': '$2.99',\n", + " 'name': 'Build Value With Virtual Chief Financial Officer Services: Accountants: Provide services your SME clients want',\n", + " 'num_ratings': 1,\n", + " 'num_reviews': None,\n", + " 'star_rating': 2,\n", + " 'summary': \"Many businesses have a “situation available” for a wider range of services including a virtual CFO – but not many accountants are applying. What other services could you offer to supplement the particular services that have been identified by SMEs? 
This book walks you through the 'why' and 'how' to start offering business advisory services and brainstorms with you some of the services that you could include in your accountancy business' product offerings.\",\n", + " 'url': 'https://www.goodreads.com/book/show/28945894-build-value-with-virtual-chief-financial-officer-services'},\n", + " {'about_author': {'name': 'Patricia Dusenbury',\n", + " 'num_books': 6,\n", + " 'num_followers': '8'},\n", + " 'author': ['Patricia Dusenbury'],\n", + " 'community_reviews': {'1_stars': {'reviews_num': 1,\n", + " 'reviews_percentage': 10},\n", + " '2_stars': {'reviews_num': 0, 'reviews_percentage': 0},\n", + " '3_stars': {'reviews_num': 1, 'reviews_percentage': 10},\n", + " '4_stars': {'reviews_num': 3, 'reviews_percentage': 30},\n", + " '5_stars': {'reviews_num': 5, 'reviews_percentage': 50}},\n", + " 'first_published': '10/30/2015',\n", + " 'genres': None,\n", + " 'id': '27847000-a-house-of-her-own',\n", + " 'isbn': '9780692562123',\n", + " 'kindle_price': '$3.99',\n", + " 'name': 'A House of Her Own: A Claire Marshall Novel',\n", + " 'num_ratings': 10,\n", + " 'num_reviews': '5',\n", + " 'star_rating': 4.1,\n", + " 'summary': \"If something seems to good to be true .... That house in that neighborhood for that price? Claire Marshall thought she'd hit the jackpot. Her company would restore the old house to its previous glory and sell it at a huge profit. She hadn't counted on the runaway girl hiding in the upstairs bedroom, the brutal gang chasing her, or the angry ghost who may or may not be keeping the gang at bay. Claire's business partner says they won't be able to give this house away. He wants to write off their investment and walk away, her workers don't want to go inside, but Claire doesn't believe in ghosts or in giving up without a fight. And she could use a distraction. Tony, her lover, is back on the Grand Prix circuit, driving in a racing season marred by fatal accidents, and, according the media, finding comfort in the arms of another woman. 
Just when Claire thinks that things can't get worse...\",\n", + " 'url': 'https://www.goodreads.com/book/show/27847000-a-house-of-her-own'}]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Goodreads Books - Download\n", + "goodreads_data = await client.datasets.goodreads_books.download(goodreads_snapshot)\n", + "print(f\"Goodreads Books: {len(goodreads_data)} records\")\n", + "goodreads_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reference Data" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "World Population snapshot: snap_mlowueht1losatuxva\n" + ] + } + ], + "source": [ + "# World Population - Trigger\n", + "world_pop_snapshot = await client.datasets.world_population.sample(records_limit=2)\n", + "print(f\"World Population snapshot: {world_pop_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "World Population: 2 records\n" + ] + }, + { + "data": { + "text/plain": [ + "[{'abbreviation': 'BHS',\n", + " 'annual_population_growth': '1.6K 0.41%',\n", + " 'births_per_day': 12,\n", + " 'capital': 'Nassau',\n", + " 'continent': 'North America',\n", + " 'country': 'Bahamas',\n", + " 'country_area': None,\n", + " 'country_density (/km²)': None,\n", + " 'country_land_area': None,\n", + " 'country_population_rank': 178,\n", + " 'deaths_per_day': 10,\n", + " 'emigrations_per_day': None,\n", + " 'flag_image': 'https://worldpopulationreview.com/images/country-flags/png100/bs.png',\n", + " 'larget_cities': ['Nassau', 'Lucaya', 'Freeport'],\n", + " 'last_year_population': None,\n", + " 'net_change_per_day': 4,\n", + " 'population_by_year': None,\n", + " 'population_change': '172',\n", + " 'population_world_percentage': None,\n", + " 'regions': ['Caribbean'],\n", + " 'url': 'https://worldpopulationreview.com/countries/bahamas'},\n", + " {'abbreviation': 'CRI',\n", + " 'annual_population_growth': '21.8K 0.45%',\n", + " 'births_per_day': 137,\n", + " 'capital': 'San José',\n", + " 'continent': 'North America',\n", + " 'country': 'Costa-rica',\n", + " 'country_area': None,\n", + " 'country_density (/km²)': None,\n", + " 'country_land_area': None,\n", + " 'country_population_rank': 127,\n", + " 'deaths_per_day': 82,\n", + " 'emigrations_per_day': None,\n", + " 'flag_image': 'https://worldpopulationreview.com/images/country-flags/png100/cr.png',\n", + " 'larget_cities': ['San Jose', 'Alajuela', 'Heredia'],\n", + " 'last_year_population': None,\n", + " 'net_change_per_day': 58,\n", + " 'population_by_year': None,\n", + " 'population_change': '2494',\n", + " 'population_world_percentage': None,\n", + " 'regions': ['Central America', 'Latin America'],\n", + " 'url': 'https://worldpopulationreview.com/countries/costa-rica'}]" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# World Population - Download\n", + "world_pop_data = await client.datasets.world_population.download(world_pop_snapshot)\n", + "print(f\"World Population: {len(world_pop_data)} records\")\n", + "world_pop_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# World Zipcodes - Trigger\n", + "world_zip_snapshot = await client.datasets.world_zipcodes.sample(records_limit=2)\n", + 
"print(f\"World Zipcodes snapshot: {world_zip_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# World Zipcodes - Download\n", + "world_zip_data = await client.datasets.world_zipcodes.download(world_zip_snapshot)\n", + "print(f\"World Zipcodes: {len(world_zip_data)} records\")\n", + "world_zip_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Business Intelligence" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Companies Enriched snapshot: snap_mlowuqjv283bi7hfob\n" + ] + } + ], + "source": [ + "# Companies Enriched - Trigger\n", + "companies_enriched_snapshot = await client.datasets.companies_enriched.sample(records_limit=2)\n", + "print(f\"Companies Enriched snapshot: {companies_enriched_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "ename": "TimeoutError", + "evalue": "Snapshot snap_mlowuqjv283bi7hfob not ready after 300s (status: building)", + "output_type": "error", + "traceback": [ + "\u001b[31m---------------------------------------------------------------------------\u001b[39m", + "\u001b[31mTimeoutError\u001b[39m Traceback (most recent call last)", + "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[11]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Companies Enriched - Download\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m companies_enriched_data = \u001b[38;5;28;01mawait\u001b[39;00m client.datasets.companies_enriched.download(companies_enriched_snapshot)\n\u001b[32m 3\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCompanies Enriched: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(companies_enriched_data)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m records\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 4\u001b[39m companies_enriched_data\n", + "\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/projects/sdk-python/src/brightdata/datasets/base.py:185\u001b[39m, in \u001b[36mBaseDataset.download\u001b[39m\u001b[34m(self, snapshot_id, format, timeout, poll_interval)\u001b[39m\n\u001b[32m 183\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m DatasetError(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mSnapshot failed: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstatus.error\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 184\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m time.time() - start_time > timeout:\n\u001b[32m--> \u001b[39m\u001b[32m185\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTimeoutError\u001b[39;00m(\n\u001b[32m 186\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mSnapshot \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msnapshot_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m not ready after \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtimeout\u001b[38;5;132;01m}\u001b[39;00m\u001b[33ms \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 187\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m(status: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstatus.status\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m)\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 188\u001b[39m )\n\u001b[32m 190\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m asyncio.sleep(poll_interval)\n\u001b[32m 192\u001b[39m \u001b[38;5;66;03m# Download 
data\u001b[39;00m\n", + "\u001b[31mTimeoutError\u001b[39m: Snapshot snap_mlowuqjv283bi7hfob not ready after 300s (status: building)" + ] + } + ], + "source": [ + "# Companies Enriched - Download\n", + "companies_enriched_data = await client.datasets.companies_enriched.download(companies_enriched_snapshot)\n", + "print(f\"Companies Enriched: {len(companies_enriched_data)} records\")\n", + "companies_enriched_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Employees Enriched - Trigger\n", + "employees_enriched_snapshot = await client.datasets.employees_enriched.sample(records_limit=2)\n", + "print(f\"Employees Enriched snapshot: {employees_enriched_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Employees Enriched - Download\n", + "employees_enriched_data = await client.datasets.employees_enriched.download(employees_enriched_snapshot)\n", + "print(f\"Employees Enriched: {len(employees_enriched_data)} records\")\n", + "employees_enriched_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ZoomInfo Companies - Trigger\n", + "zoominfo_snapshot = await client.datasets.zoominfo_companies.sample(records_limit=2)\n", + "print(f\"ZoomInfo Companies snapshot: {zoominfo_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# ZoomInfo Companies - Download\n", + "zoominfo_data = await client.datasets.zoominfo_companies.download(zoominfo_snapshot)\n", + "print(f\"ZoomInfo Companies: {len(zoominfo_data)} records\")\n", + "zoominfo_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# PitchBook Companies - Trigger\n", + "pitchbook_snapshot = await client.datasets.pitchbook_companies.sample(records_limit=2)\n", + "print(f\"PitchBook Companies snapshot: {pitchbook_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# PitchBook Companies - Download\n", + "pitchbook_data = await client.datasets.pitchbook_companies.download(pitchbook_snapshot)\n", + "print(f\"PitchBook Companies: {len(pitchbook_data)} records\")\n", + "pitchbook_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Slintel Companies - Trigger\n", + "slintel_snapshot = await client.datasets.slintel_companies.sample(records_limit=2)\n", + "print(f\"Slintel Companies snapshot: {slintel_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Slintel Companies - Download\n", + "slintel_data = await client.datasets.slintel_companies.download(slintel_snapshot)\n", + "print(f\"Slintel Companies: {len(slintel_data)} records\")\n", + "slintel_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Owler Companies - Trigger\n", + "owler_snapshot = await client.datasets.owler_companies.sample(records_limit=2)\n", + "print(f\"Owler Companies snapshot: {owler_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Owler Companies - Download\n", + "owler_data = await client.datasets.owler_companies.download(owler_snapshot)\n", + 
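"# download() polls the snapshot until it is ready and raises TimeoutError if it is still building when the timeout (300s by default) elapses\n", +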
"print(f\"Owler Companies: {len(owler_data)} records\")\n", + "owler_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# VentureRadar Companies - Trigger\n", + "ventureradar_snapshot = await client.datasets.ventureradar_companies.sample(records_limit=2)\n", + "print(f\"VentureRadar Companies snapshot: {ventureradar_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# VentureRadar Companies - Download\n", + "ventureradar_data = await client.datasets.ventureradar_companies.download(ventureradar_snapshot)\n", + "print(f\"VentureRadar Companies: {len(ventureradar_data)} records\")\n", + "ventureradar_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Manta Businesses - Trigger\n", + "manta_snapshot = await client.datasets.manta_businesses.sample(records_limit=2)\n", + "print(f\"Manta Businesses snapshot: {manta_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Manta Businesses - Download\n", + "manta_data = await client.datasets.manta_businesses.download(manta_snapshot)\n", + "print(f\"Manta Businesses: {len(manta_data)} records\")\n", + "manta_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Job & HR Platforms" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Glassdoor Companies - Trigger\n", + "glassdoor_companies_snapshot = await client.datasets.glassdoor_companies.sample(records_limit=2)\n", + "print(f\"Glassdoor Companies snapshot: {glassdoor_companies_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Glassdoor Companies - Download\n", + "glassdoor_companies_data = await client.datasets.glassdoor_companies.download(glassdoor_companies_snapshot)\n", + "print(f\"Glassdoor Companies: {len(glassdoor_companies_data)} records\")\n", + "glassdoor_companies_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Glassdoor Reviews - Trigger\n", + "glassdoor_reviews_snapshot = await client.datasets.glassdoor_reviews.sample(records_limit=2)\n", + "print(f\"Glassdoor Reviews snapshot: {glassdoor_reviews_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Glassdoor Reviews - Download\n", + "glassdoor_reviews_data = await client.datasets.glassdoor_reviews.download(glassdoor_reviews_snapshot)\n", + "print(f\"Glassdoor Reviews: {len(glassdoor_reviews_data)} records\")\n", + "glassdoor_reviews_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Glassdoor Jobs - Trigger\n", + "glassdoor_jobs_snapshot = await client.datasets.glassdoor_jobs.sample(records_limit=2)\n", + "print(f\"Glassdoor Jobs snapshot: {glassdoor_jobs_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Glassdoor Jobs - Download\n", + "glassdoor_jobs_data = await client.datasets.glassdoor_jobs.download(glassdoor_jobs_snapshot)\n", + "print(f\"Glassdoor Jobs: {len(glassdoor_jobs_data)} records\")\n", + "glassdoor_jobs_data" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Indeed Companies - Trigger\n", + "indeed_companies_snapshot = await client.datasets.indeed_companies.sample(records_limit=2)\n", + "print(f\"Indeed Companies snapshot: {indeed_companies_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Indeed Companies - Download\n", + "indeed_companies_data = await client.datasets.indeed_companies.download(indeed_companies_snapshot)\n", + "print(f\"Indeed Companies: {len(indeed_companies_data)} records\")\n", + "indeed_companies_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Indeed Jobs - Trigger\n", + "indeed_jobs_snapshot = await client.datasets.indeed_jobs.sample(records_limit=2)\n", + "print(f\"Indeed Jobs snapshot: {indeed_jobs_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Indeed Jobs - Download\n", + "indeed_jobs_data = await client.datasets.indeed_jobs.download(indeed_jobs_snapshot)\n", + "print(f\"Indeed Jobs: {len(indeed_jobs_data)} records\")\n", + "indeed_jobs_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Xing Profiles - Trigger\n", + "xing_snapshot = await client.datasets.xing_profiles.sample(records_limit=2)\n", + "print(f\"Xing Profiles snapshot: {xing_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Xing Profiles - Download\n", + "xing_data = await client.datasets.xing_profiles.download(xing_snapshot)\n", + "print(f\"Xing Profiles: {len(xing_data)} records\")\n", + "xing_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Reviews & Ratings" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Google Maps Reviews - Trigger\n", + "google_maps_snapshot = await client.datasets.google_maps_reviews.sample(records_limit=2)\n", + "print(f\"Google Maps Reviews snapshot: {google_maps_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Google Maps Reviews - Download\n", + "google_maps_data = await client.datasets.google_maps_reviews.download(google_maps_snapshot)\n", + "print(f\"Google Maps Reviews: {len(google_maps_data)} records\")\n", + "google_maps_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Yelp Businesses - Trigger\n", + "yelp_businesses_snapshot = await client.datasets.yelp_businesses.sample(records_limit=2)\n", + "print(f\"Yelp Businesses snapshot: {yelp_businesses_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Yelp Businesses - Download\n", + "yelp_businesses_data = await client.datasets.yelp_businesses.download(yelp_businesses_snapshot)\n", + "print(f\"Yelp Businesses: {len(yelp_businesses_data)} records\")\n", + "yelp_businesses_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Yelp Reviews - Trigger\n", + "yelp_reviews_snapshot = await client.datasets.yelp_reviews.sample(records_limit=2)\n", + "print(f\"Yelp Reviews snapshot: {yelp_reviews_snapshot}\")" + ] + }, + { 
+ "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Yelp Reviews - Download\n", + "yelp_reviews_data = await client.datasets.yelp_reviews.download(yelp_reviews_snapshot)\n", + "print(f\"Yelp Reviews: {len(yelp_reviews_data)} records\")\n", + "yelp_reviews_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# G2 Products - Trigger\n", + "g2_products_snapshot = await client.datasets.g2_products.sample(records_limit=2)\n", + "print(f\"G2 Products snapshot: {g2_products_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# G2 Products - Download\n", + "g2_products_data = await client.datasets.g2_products.download(g2_products_snapshot)\n", + "print(f\"G2 Products: {len(g2_products_data)} records\")\n", + "g2_products_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# G2 Reviews - Trigger\n", + "g2_reviews_snapshot = await client.datasets.g2_reviews.sample(records_limit=2)\n", + "print(f\"G2 Reviews snapshot: {g2_reviews_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# G2 Reviews - Download\n", + "g2_reviews_data = await client.datasets.g2_reviews.download(g2_reviews_snapshot)\n", + "print(f\"G2 Reviews: {len(g2_reviews_data)} records\")\n", + "g2_reviews_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Trustpilot Reviews - Trigger\n", + "trustpilot_snapshot = await client.datasets.trustpilot_reviews.sample(records_limit=2)\n", + "print(f\"Trustpilot Reviews snapshot: {trustpilot_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Trustpilot Reviews - Download\n", + "trustpilot_data = await client.datasets.trustpilot_reviews.download(trustpilot_snapshot)\n", + "print(f\"Trustpilot Reviews: {len(trustpilot_data)} records\")\n", + "trustpilot_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TrustRadius Reviews - Trigger\n", + "trustradius_snapshot = await client.datasets.trustradius_reviews.sample(records_limit=2)\n", + "print(f\"TrustRadius Reviews snapshot: {trustradius_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# TrustRadius Reviews - Download\n", + "trustradius_data = await client.datasets.trustradius_reviews.download(trustradius_snapshot)\n", + "print(f\"TrustRadius Reviews: {len(trustradius_data)} records\")\n", + "trustradius_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Professional Services" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# US Lawyers - Trigger\n", + "us_lawyers_snapshot = await client.datasets.us_lawyers.sample(records_limit=2)\n", + "print(f\"US Lawyers snapshot: {us_lawyers_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# US Lawyers - Download\n", + "us_lawyers_data = await client.datasets.us_lawyers.download(us_lawyers_snapshot)\n", + "print(f\"US Lawyers: {len(us_lawyers_data)} records\")\n", + "us_lawyers_data" + ] + }, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "## Social Media" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Pinterest Posts - Trigger\n", + "pinterest_posts_snapshot = await client.datasets.pinterest_posts.sample(records_limit=2)\n", + "print(f\"Pinterest Posts snapshot: {pinterest_posts_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Pinterest Posts - Download\n", + "pinterest_posts_data = await client.datasets.pinterest_posts.download(pinterest_posts_snapshot)\n", + "print(f\"Pinterest Posts: {len(pinterest_posts_data)} records\")\n", + "pinterest_posts_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Pinterest Profiles - Trigger\n", + "pinterest_profiles_snapshot = await client.datasets.pinterest_profiles.sample(records_limit=2)\n", + "print(f\"Pinterest Profiles snapshot: {pinterest_profiles_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Pinterest Profiles - Download\n", + "pinterest_profiles_data = await client.datasets.pinterest_profiles.download(pinterest_profiles_snapshot)\n", + "print(f\"Pinterest Profiles: {len(pinterest_profiles_data)} records\")\n", + "pinterest_profiles_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Facebook Pages Posts - Trigger\n", + "facebook_posts_snapshot = await client.datasets.facebook_pages_posts.sample(records_limit=2)\n", + "print(f\"Facebook Pages Posts snapshot: {facebook_posts_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Facebook Pages Posts - Download\n", + "facebook_posts_data = await client.datasets.facebook_pages_posts.download(facebook_posts_snapshot)\n", + "print(f\"Facebook Pages Posts: {len(facebook_posts_data)} records\")\n", + "facebook_posts_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Real Estate" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Australia Real Estate - Trigger\n", + "australia_re_snapshot = await client.datasets.australia_real_estate.sample(records_limit=2)\n", + "print(f\"Australia Real Estate snapshot: {australia_re_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Australia Real Estate - Download\n", + "australia_re_data = await client.datasets.australia_real_estate.download(australia_re_snapshot)\n", + "print(f\"Australia Real Estate: {len(australia_re_data)} records\")\n", + "australia_re_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Zillow Properties - Trigger\n", + "zillow_snapshot = await client.datasets.zillow_properties.sample(records_limit=2)\n", + "print(f\"Zillow Properties snapshot: {zillow_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Zillow Properties - Download\n", + "zillow_data = await client.datasets.zillow_properties.download(zillow_snapshot)\n", + "print(f\"Zillow Properties: {len(zillow_data)} records\")\n", + "zillow_data" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Airbnb Properties - Trigger\n", + "airbnb_snapshot = await client.datasets.airbnb_properties.sample(records_limit=2)\n", + "print(f\"Airbnb Properties snapshot: {airbnb_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Airbnb Properties - Download\n", + "airbnb_data = await client.datasets.airbnb_properties.download(airbnb_snapshot)\n", + "print(f\"Airbnb Properties: {len(airbnb_data)} records\")\n", + "airbnb_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Otodom Poland - Trigger\n", + "otodom_snapshot = await client.datasets.otodom_poland.sample(records_limit=2)\n", + "print(f\"Otodom Poland snapshot: {otodom_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Otodom Poland - Download\n", + "otodom_data = await client.datasets.otodom_poland.download(otodom_snapshot)\n", + "print(f\"Otodom Poland: {len(otodom_data)} records\")\n", + "otodom_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Zonaprop Argentina - Trigger\n", + "zonaprop_snapshot = await client.datasets.zonaprop_argentina.sample(records_limit=2)\n", + "print(f\"Zonaprop Argentina snapshot: {zonaprop_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Zonaprop Argentina - Download\n", + "zonaprop_data = await client.datasets.zonaprop_argentina.download(zonaprop_snapshot)\n", + "print(f\"Zonaprop Argentina: {len(zonaprop_data)} records\")\n", + "zonaprop_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Metrocuadrado Properties - Trigger\n", + "metrocuadrado_snapshot = await client.datasets.metrocuadrado_properties.sample(records_limit=2)\n", + "print(f\"Metrocuadrado Properties snapshot: {metrocuadrado_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Metrocuadrado Properties - Download\n", + "metrocuadrado_data = await client.datasets.metrocuadrado_properties.download(metrocuadrado_snapshot)\n", + "print(f\"Metrocuadrado Properties: {len(metrocuadrado_data)} records\")\n", + "metrocuadrado_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Infocasas Uruguay - Trigger\n", + "infocasas_snapshot = await client.datasets.infocasas_uruguay.sample(records_limit=2)\n", + "print(f\"Infocasas Uruguay snapshot: {infocasas_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Infocasas Uruguay - Download\n", + "infocasas_data = await client.datasets.infocasas_uruguay.download(infocasas_snapshot)\n", + "print(f\"Infocasas Uruguay: {len(infocasas_data)} records\")\n", + "infocasas_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Properati Properties - Trigger\n", + "properati_snapshot = await client.datasets.properati_properties.sample(records_limit=2)\n", + "print(f\"Properati Properties snapshot: {properati_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + 
"outputs": [], + "source": [ + "# Properati Properties - Download\n", + "properati_data = await client.datasets.properati_properties.download(properati_snapshot)\n", + "print(f\"Properati Properties: {len(properati_data)} records\")\n", + "properati_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Toctoc Properties - Trigger\n", + "toctoc_snapshot = await client.datasets.toctoc_properties.sample(records_limit=2)\n", + "print(f\"Toctoc Properties snapshot: {toctoc_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Toctoc Properties - Download\n", + "toctoc_data = await client.datasets.toctoc_properties.download(toctoc_snapshot)\n", + "print(f\"Toctoc Properties: {len(toctoc_data)} records\")\n", + "toctoc_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Inmuebles24 Mexico - Trigger\n", + "inmuebles24_snapshot = await client.datasets.inmuebles24_mexico.sample(records_limit=2)\n", + "print(f\"Inmuebles24 Mexico snapshot: {inmuebles24_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Inmuebles24 Mexico - Download\n", + "inmuebles24_data = await client.datasets.inmuebles24_mexico.download(inmuebles24_snapshot)\n", + "print(f\"Inmuebles24 Mexico: {len(inmuebles24_data)} records\")\n", + "inmuebles24_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Classifieds & Marketplaces" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# OLX Brazil - Trigger\n", + "olx_snapshot = await client.datasets.olx_brazil.sample(records_limit=2)\n", + "print(f\"OLX Brazil snapshot: {olx_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# OLX Brazil - Download\n", + "olx_data = await client.datasets.olx_brazil.download(olx_snapshot)\n", + "print(f\"OLX Brazil: {len(olx_data)} records\")\n", + "olx_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Yapo Chile - Trigger\n", + "yapo_snapshot = await client.datasets.yapo_chile.sample(records_limit=2)\n", + "print(f\"Yapo Chile snapshot: {yapo_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Yapo Chile - Download\n", + "yapo_data = await client.datasets.yapo_chile.download(yapo_snapshot)\n", + "print(f\"Yapo Chile: {len(yapo_data)} records\")\n", + "yapo_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Webmotors Brasil - Trigger\n", + "webmotors_snapshot = await client.datasets.webmotors_brasil.sample(records_limit=2)\n", + "print(f\"Webmotors Brasil snapshot: {webmotors_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Webmotors Brasil - Download\n", + "webmotors_data = await client.datasets.webmotors_brasil.download(webmotors_snapshot)\n", + "print(f\"Webmotors Brasil: {len(webmotors_data)} records\")\n", + "webmotors_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Chileautos Chile - Trigger\n", + 
"chileautos_snapshot = await client.datasets.chileautos_chile.sample(records_limit=2)\n", + "print(f\"Chileautos Chile snapshot: {chileautos_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Chileautos Chile - Download\n", + "chileautos_data = await client.datasets.chileautos_chile.download(chileautos_snapshot)\n", + "print(f\"Chileautos Chile: {len(chileautos_data)} records\")\n", + "chileautos_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## E-commerce - General" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Walmart Products - Trigger\n", + "walmart_snapshot = await client.datasets.walmart_products.sample(records_limit=2)\n", + "print(f\"Walmart Products snapshot: {walmart_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Walmart Products - Download\n", + "walmart_data = await client.datasets.walmart_products.download(walmart_snapshot)\n", + "print(f\"Walmart Products: {len(walmart_data)} records\")\n", + "walmart_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Shopee Products - Trigger\n", + "shopee_snapshot = await client.datasets.shopee_products.sample(records_limit=2)\n", + "print(f\"Shopee Products snapshot: {shopee_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Shopee Products - Download\n", + "shopee_data = await client.datasets.shopee_products.download(shopee_snapshot)\n", + "print(f\"Shopee Products: {len(shopee_data)} records\")\n", + "shopee_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lazada Products - Trigger\n", + "lazada_snapshot = await client.datasets.lazada_products.sample(records_limit=2)\n", + "print(f\"Lazada Products snapshot: {lazada_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lazada Products - Download\n", + "lazada_data = await client.datasets.lazada_products.download(lazada_snapshot)\n", + "print(f\"Lazada Products: {len(lazada_data)} records\")\n", + "lazada_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Mediamarkt Products - Trigger\n", + "mediamarkt_snapshot = await client.datasets.mediamarkt_products.sample(records_limit=2)\n", + "print(f\"Mediamarkt Products snapshot: {mediamarkt_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Mediamarkt Products - Download\n", + "mediamarkt_data = await client.datasets.mediamarkt_products.download(mediamarkt_snapshot)\n", + "print(f\"Mediamarkt Products: {len(mediamarkt_data)} records\")\n", + "mediamarkt_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## E-commerce - Fashion" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Zalando Products - Trigger\n", + "zalando_snapshot = await client.datasets.zalando_products.sample(records_limit=2)\n", + "print(f\"Zalando Products snapshot: {zalando_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + 
"outputs": [], + "source": [ + "# Zalando Products - Download\n", + "zalando_data = await client.datasets.zalando_products.download(zalando_snapshot)\n", + "print(f\"Zalando Products: {len(zalando_data)} records\")\n", + "zalando_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Asos Products - Trigger\n", + "asos_snapshot = await client.datasets.asos_products.sample(records_limit=2)\n", + "print(f\"Asos Products snapshot: {asos_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Asos Products - Download\n", + "asos_data = await client.datasets.asos_products.download(asos_snapshot)\n", + "print(f\"Asos Products: {len(asos_data)} records\")\n", + "asos_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Zara Products - Trigger\n", + "zara_snapshot = await client.datasets.zara_products.sample(records_limit=2)\n", + "print(f\"Zara Products snapshot: {zara_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Zara Products - Download\n", + "zara_data = await client.datasets.zara_products.download(zara_snapshot)\n", + "print(f\"Zara Products: {len(zara_data)} records\")\n", + "zara_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Zara Home Products - Trigger\n", + "zara_home_snapshot = await client.datasets.zara_home_products.sample(records_limit=2)\n", + "print(f\"Zara Home Products snapshot: {zara_home_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Zara Home Products - Download\n", + "zara_home_data = await client.datasets.zara_home_products.download(zara_home_snapshot)\n", + "print(f\"Zara Home Products: {len(zara_home_data)} records\")\n", + "zara_home_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Mango Products - Trigger\n", + "mango_snapshot = await client.datasets.mango_products.sample(records_limit=2)\n", + "print(f\"Mango Products snapshot: {mango_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Mango Products - Download\n", + "mango_data = await client.datasets.mango_products.download(mango_snapshot)\n", + "print(f\"Mango Products: {len(mango_data)} records\")\n", + "mango_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Massimo Dutti Products - Trigger\n", + "massimo_dutti_snapshot = await client.datasets.massimo_dutti_products.sample(records_limit=2)\n", + "print(f\"Massimo Dutti Products snapshot: {massimo_dutti_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Massimo Dutti Products - Download\n", + "massimo_dutti_data = await client.datasets.massimo_dutti_products.download(massimo_dutti_snapshot)\n", + "print(f\"Massimo Dutti Products: {len(massimo_dutti_data)} records\")\n", + "massimo_dutti_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# H&M Products - Trigger\n", + "hm_snapshot = await client.datasets.hm_products.sample(records_limit=2)\n", + 
"print(f\"H&M Products snapshot: {hm_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# H&M Products - Download\n", + "hm_data = await client.datasets.hm_products.download(hm_snapshot)\n", + "print(f\"H&M Products: {len(hm_data)} records\")\n", + "hm_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Shein Products - Trigger\n", + "shein_snapshot = await client.datasets.shein_products.sample(records_limit=2)\n", + "print(f\"Shein Products snapshot: {shein_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Shein Products - Download\n", + "shein_data = await client.datasets.shein_products.download(shein_snapshot)\n", + "print(f\"Shein Products: {len(shein_data)} records\")\n", + "shein_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# American Eagle Products - Trigger\n", + "american_eagle_snapshot = await client.datasets.american_eagle_products.sample(records_limit=2)\n", + "print(f\"American Eagle Products snapshot: {american_eagle_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# American Eagle Products - Download\n", + "american_eagle_data = await client.datasets.american_eagle_products.download(american_eagle_snapshot)\n", + "print(f\"American Eagle Products: {len(american_eagle_data)} records\")\n", + "american_eagle_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Carters Products - Trigger\n", + "carters_snapshot = await client.datasets.carters_products.sample(records_limit=2)\n", + "print(f\"Carters Products snapshot: {carters_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Carters Products - Download\n", + "carters_data = await client.datasets.carters_products.download(carters_snapshot)\n", + "print(f\"Carters Products: {len(carters_data)} records\")\n", + "carters_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Fanatics Products - Trigger\n", + "fanatics_snapshot = await client.datasets.fanatics_products.sample(records_limit=2)\n", + "print(f\"Fanatics Products snapshot: {fanatics_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Fanatics Products - Download\n", + "fanatics_data = await client.datasets.fanatics_products.download(fanatics_snapshot)\n", + "print(f\"Fanatics Products: {len(fanatics_data)} records\")\n", + "fanatics_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## E-commerce - Luxury" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Chanel Products - Trigger\n", + "chanel_snapshot = await client.datasets.chanel_products.sample(records_limit=2)\n", + "print(f\"Chanel Products snapshot: {chanel_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Chanel Products - Download\n", + "chanel_data = await client.datasets.chanel_products.download(chanel_snapshot)\n", + "print(f\"Chanel Products: {len(chanel_data)} 
records\")\n", + "chanel_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Dior Products - Trigger\n", + "dior_snapshot = await client.datasets.dior_products.sample(records_limit=2)\n", + "print(f\"Dior Products snapshot: {dior_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Dior Products - Download\n", + "dior_data = await client.datasets.dior_products.download(dior_snapshot)\n", + "print(f\"Dior Products: {len(dior_data)} records\")\n", + "dior_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Fendi Products - Trigger\n", + "fendi_snapshot = await client.datasets.fendi_products.sample(records_limit=2)\n", + "print(f\"Fendi Products snapshot: {fendi_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Fendi Products - Download\n", + "fendi_data = await client.datasets.fendi_products.download(fendi_snapshot)\n", + "print(f\"Fendi Products: {len(fendi_data)} records\")\n", + "fendi_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Prada Products - Trigger\n", + "prada_snapshot = await client.datasets.prada_products.sample(records_limit=2)\n", + "print(f\"Prada Products snapshot: {prada_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Prada Products - Download\n", + "prada_data = await client.datasets.prada_products.download(prada_snapshot)\n", + "print(f\"Prada Products: {len(prada_data)} records\")\n", + "prada_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Balenciaga Products - Trigger\n", + "balenciaga_snapshot = await client.datasets.balenciaga_products.sample(records_limit=2)\n", + "print(f\"Balenciaga Products snapshot: {balenciaga_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Balenciaga Products - Download\n", + "balenciaga_data = await client.datasets.balenciaga_products.download(balenciaga_snapshot)\n", + "print(f\"Balenciaga Products: {len(balenciaga_data)} records\")\n", + "balenciaga_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Bottega Veneta Products - Trigger\n", + "bottega_snapshot = await client.datasets.bottegaveneta_products.sample(records_limit=2)\n", + "print(f\"Bottega Veneta Products snapshot: {bottega_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Bottega Veneta Products - Download\n", + "bottega_data = await client.datasets.bottegaveneta_products.download(bottega_snapshot)\n", + "print(f\"Bottega Veneta Products: {len(bottega_data)} records\")\n", + "bottega_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Celine Products - Trigger\n", + "celine_snapshot = await client.datasets.celine_products.sample(records_limit=2)\n", + "print(f\"Celine Products snapshot: {celine_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Celine Products - Download\n", 
+ "celine_data = await client.datasets.celine_products.download(celine_snapshot)\n", + "print(f\"Celine Products: {len(celine_data)} records\")\n", + "celine_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Loewe Products - Trigger\n", + "loewe_snapshot = await client.datasets.loewe_products.sample(records_limit=2)\n", + "print(f\"Loewe Products snapshot: {loewe_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Loewe Products - Download\n", + "loewe_data = await client.datasets.loewe_products.download(loewe_snapshot)\n", + "print(f\"Loewe Products: {len(loewe_data)} records\")\n", + "loewe_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Berluti Products - Trigger\n", + "berluti_snapshot = await client.datasets.berluti_products.sample(records_limit=2)\n", + "print(f\"Berluti Products snapshot: {berluti_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Berluti Products - Download\n", + "berluti_data = await client.datasets.berluti_products.download(berluti_snapshot)\n", + "print(f\"Berluti Products: {len(berluti_data)} records\")\n", + "berluti_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Moynat Products - Trigger\n", + "moynat_snapshot = await client.datasets.moynat_products.sample(records_limit=2)\n", + "print(f\"Moynat Products snapshot: {moynat_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Moynat Products - Download\n", + "moynat_data = await client.datasets.moynat_products.download(moynat_snapshot)\n", + "print(f\"Moynat Products: {len(moynat_data)} records\")\n", + "moynat_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Hermes Products - Trigger\n", + "hermes_snapshot = await client.datasets.hermes_products.sample(records_limit=2)\n", + "print(f\"Hermes Products snapshot: {hermes_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Hermes Products - Download\n", + "hermes_data = await client.datasets.hermes_products.download(hermes_snapshot)\n", + "print(f\"Hermes Products: {len(hermes_data)} records\")\n", + "hermes_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Delvaux Products - Trigger\n", + "delvaux_snapshot = await client.datasets.delvaux_products.sample(records_limit=2)\n", + "print(f\"Delvaux Products snapshot: {delvaux_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Delvaux Products - Download\n", + "delvaux_data = await client.datasets.delvaux_products.download(delvaux_snapshot)\n", + "print(f\"Delvaux Products: {len(delvaux_data)} records\")\n", + "delvaux_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Montblanc Products - Trigger\n", + "montblanc_snapshot = await client.datasets.montblanc_products.sample(records_limit=2)\n", + "print(f\"Montblanc Products snapshot: {montblanc_snapshot}\")" + ] + }, + { + "cell_type": "code", + 
"execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Montblanc Products - Download\n", + "montblanc_data = await client.datasets.montblanc_products.download(montblanc_snapshot)\n", + "print(f\"Montblanc Products: {len(montblanc_data)} records\")\n", + "montblanc_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# YSL Products - Trigger\n", + "ysl_snapshot = await client.datasets.ysl_products.sample(records_limit=2)\n", + "print(f\"YSL Products snapshot: {ysl_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# YSL Products - Download\n", + "ysl_data = await client.datasets.ysl_products.download(ysl_snapshot)\n", + "print(f\"YSL Products: {len(ysl_data)} records\")\n", + "ysl_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## E-commerce - Beauty" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sephora Products - Trigger\n", + "sephora_snapshot = await client.datasets.sephora_products.sample(records_limit=2)\n", + "print(f\"Sephora Products snapshot: {sephora_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sephora Products - Download\n", + "sephora_data = await client.datasets.sephora_products.download(sephora_snapshot)\n", + "print(f\"Sephora Products: {len(sephora_data)} records\")\n", + "sephora_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## E-commerce - Home & Furniture" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Ikea Products - Trigger\n", + "ikea_snapshot = await client.datasets.ikea_products.sample(records_limit=2)\n", + "print(f\"Ikea Products snapshot: {ikea_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Ikea Products - Download\n", + "ikea_data = await client.datasets.ikea_products.download(ikea_snapshot)\n", + "print(f\"Ikea Products: {len(ikea_data)} records\")\n", + "ikea_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Ashley Furniture Products - Trigger\n", + "ashley_snapshot = await client.datasets.ashley_furniture_products.sample(records_limit=2)\n", + "print(f\"Ashley Furniture Products snapshot: {ashley_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Ashley Furniture Products - Download\n", + "ashley_data = await client.datasets.ashley_furniture_products.download(ashley_snapshot)\n", + "print(f\"Ashley Furniture Products: {len(ashley_data)} records\")\n", + "ashley_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Crate and Barrel Products - Trigger\n", + "cratebarrel_snapshot = await client.datasets.crateandbarrel_products.sample(records_limit=2)\n", + "print(f\"Crate and Barrel Products snapshot: {cratebarrel_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Crate and Barrel Products - Download\n", + "cratebarrel_data = await client.datasets.crateandbarrel_products.download(cratebarrel_snapshot)\n", + "print(f\"Crate and 
Barrel Products: {len(cratebarrel_data)} records\")\n", + "cratebarrel_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# La-Z-Boy Products - Trigger\n", + "lazboy_snapshot = await client.datasets.lazboy_products.sample(records_limit=2)\n", + "print(f\"La-Z-Boy Products snapshot: {lazboy_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# La-Z-Boy Products - Download\n", + "lazboy_data = await client.datasets.lazboy_products.download(lazboy_snapshot)\n", + "print(f\"La-Z-Boy Products: {len(lazboy_data)} records\")\n", + "lazboy_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Mybobs Products - Trigger\n", + "mybobs_snapshot = await client.datasets.mybobs_products.sample(records_limit=2)\n", + "print(f\"Mybobs Products snapshot: {mybobs_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Mybobs Products - Download\n", + "mybobs_data = await client.datasets.mybobs_products.download(mybobs_snapshot)\n", + "print(f\"Mybobs Products: {len(mybobs_data)} records\")\n", + "mybobs_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sleep Number Products - Trigger\n", + "sleepnumber_snapshot = await client.datasets.sleepnumber_products.sample(records_limit=2)\n", + "print(f\"Sleep Number Products snapshot: {sleepnumber_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Sleep Number Products - Download\n", + "sleepnumber_data = await client.datasets.sleepnumber_products.download(sleepnumber_snapshot)\n", + "print(f\"Sleep Number Products: {len(sleepnumber_data)} records\")\n", + "sleepnumber_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Raymour Flanigan Products - Trigger\n", + "raymour_snapshot = await client.datasets.raymourflanigan_products.sample(records_limit=2)\n", + "print(f\"Raymour Flanigan Products snapshot: {raymour_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Raymour Flanigan Products - Download\n", + "raymour_data = await client.datasets.raymourflanigan_products.download(raymour_snapshot)\n", + "print(f\"Raymour Flanigan Products: {len(raymour_data)} records\")\n", + "raymour_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Mattressfirm Products - Trigger\n", + "mattressfirm_snapshot = await client.datasets.mattressfirm_products.sample(records_limit=2)\n", + "print(f\"Mattressfirm Products snapshot: {mattressfirm_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Mattressfirm Products - Download\n", + "mattressfirm_data = await client.datasets.mattressfirm_products.download(mattressfirm_snapshot)\n", + "print(f\"Mattressfirm Products: {len(mattressfirm_data)} records\")\n", + "mattressfirm_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## E-commerce - Specialty" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lego Products - 
Trigger\n", + "lego_snapshot = await client.datasets.lego_products.sample(records_limit=2)\n", + "print(f\"Lego Products snapshot: {lego_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Lego Products - Download\n", + "lego_data = await client.datasets.lego_products.download(lego_snapshot)\n", + "print(f\"Lego Products: {len(lego_data)} records\")\n", + "lego_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Toys R Us Products - Trigger\n", + "toysrus_snapshot = await client.datasets.toysrus_products.sample(records_limit=2)\n", + "print(f\"Toys R Us Products snapshot: {toysrus_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Toys R Us Products - Download\n", + "toysrus_data = await client.datasets.toysrus_products.download(toysrus_snapshot)\n", + "print(f\"Toys R Us Products: {len(toysrus_data)} records\")\n", + "toysrus_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# L.L. Bean Products - Trigger\n", + "llbean_snapshot = await client.datasets.llbean_products.sample(records_limit=2)\n", + "print(f\"L.L. Bean Products snapshot: {llbean_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# L.L. Bean Products - Download\n", + "llbean_data = await client.datasets.llbean_products.download(llbean_snapshot)\n", + "print(f\"L.L. Bean Products: {len(llbean_data)} records\")\n", + "llbean_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## E-commerce - Electronics" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Mouser Products - Trigger\n", + "mouser_snapshot = await client.datasets.mouser_products.sample(records_limit=2)\n", + "print(f\"Mouser Products snapshot: {mouser_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Mouser Products - Download\n", + "mouser_data = await client.datasets.mouser_products.download(mouser_snapshot)\n", + "print(f\"Mouser Products: {len(mouser_data)} records\")\n", + "mouser_data" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Digikey Products - Trigger\n", + "digikey_snapshot = await client.datasets.digikey_products.sample(records_limit=2)\n", + "print(f\"Digikey Products snapshot: {digikey_snapshot}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Digikey Products - Download\n", + "digikey_data = await client.datasets.digikey_products.download(digikey_snapshot)\n", + "print(f\"Digikey Products: {len(digikey_data)} records\")\n", + "digikey_data" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Cleanup" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "await client.__aexit__(None, None, None)\n", + "print(\"Client closed.\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python", + "version": "3.11.0" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git 
a/pyproject.toml b/pyproject.toml index adc068b..5d63337 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -7,7 +7,7 @@ where = ["src"] [project] name = "brightdata-sdk" -version = "2.2.0" +version = "2.2.1" description = "Modern async-first Python SDK for Bright Data APIs" authors = [{name = "Bright Data", email = "support@brightdata.com"}] license = {text = "MIT"} diff --git a/src/brightdata/datasets/base.py b/src/brightdata/datasets/base.py index 0517ce6..7beb7fb 100644 --- a/src/brightdata/datasets/base.py +++ b/src/brightdata/datasets/base.py @@ -100,8 +100,36 @@ async def __call__( json_data=payload, ) as response: data = await response.json() + + if "snapshot_id" not in data: + error_msg = data.get("error") or data.get("message") or str(data) + raise DatasetError(f"Failed to create snapshot: {error_msg}") + return data["snapshot_id"] + async def sample(self, records_limit: int = 10) -> str: + """ + Get a sample of records without specifying a filter. + + Automatically discovers the first available field and uses the + 'is_not_null' operator to fetch any available records. + + Args: + records_limit: Maximum number of records to return (default: 10) + + Returns: + snapshot_id (str) - use with download() to get data + """ + metadata = await self.get_metadata() + if not metadata.fields: + raise DatasetError(f"Dataset {self.DATASET_ID} has no fields") + + first_field = next(iter(metadata.fields)) + return await self( + filter={"name": first_field, "operator": "is_not_null"}, + records_limit=records_limit, + ) + async def get_status(self, snapshot_id: str) -> SnapshotStatus: """ Check snapshot status. diff --git a/src/brightdata/datasets/companies_enriched/companies.py b/src/brightdata/datasets/companies_enriched/companies.py index a775b27..c6a7bca 100644 --- a/src/brightdata/datasets/companies_enriched/companies.py +++ b/src/brightdata/datasets/companies_enriched/companies.py @@ -68,8 +68,7 @@ class CompaniesEnriched(BaseDataset): >>> data = await companies.download(snapshot_id) """ - # TODO: Replace with actual dataset ID - DATASET_ID = "gd_lxxxxxxxxxxxxxx" # Get from Bright Data console + DATASET_ID = "gd_m3fl0mwzmfpfn4cw4" NAME = "companies_enriched" def __init__(self, engine: "AsyncEngine"): diff --git a/src/brightdata/datasets/employees_enriched/employees.py b/src/brightdata/datasets/employees_enriched/employees.py index c10480a..9ba0da1 100644 --- a/src/brightdata/datasets/employees_enriched/employees.py +++ b/src/brightdata/datasets/employees_enriched/employees.py @@ -109,8 +109,7 @@ class EmployeesEnriched(BaseDataset): >>> data = await employees.download(snapshot_id) """ - # TODO: Replace with actual dataset ID - DATASET_ID = "gd_lxxxxxxxxxxxxxx" # Get from Bright Data console + DATASET_ID = "gd_m18zt6ec11wfqohyrs" NAME = "employees_enriched" def __init__(self, engine: "AsyncEngine"): From 430b1702bfc98c14ef22ddcce0170abe35b643fc Mon Sep 17 00:00:00 2001 From: "user.mail" Date: Mon, 16 Feb 2026 12:29:21 +0300 Subject: [PATCH 5/5] add datasets to README.md --- README.md | 51 +++++++++++++++++++++++++++++- notebooks/datasets/mass_test.ipynb | 27 +++++++++------- 2 files changed, 66 insertions(+), 12 deletions(-) diff --git a/README.md b/README.md index f91b2a2..9c4b366 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # Bright Data Python SDK -The official Python SDK for [Bright Data](https://brightdata.com) APIs. Scrape any website, get SERP results, bypass bot detection and CAPTCHAs. +The official Python SDK for [Bright Data](https://brightdata.com) APIs.
Scrape any website, get SERP results, bypass bot detection and CAPTCHAs, and access 100+ ready-made datasets. [![Python](https://img.shields.io/badge/python-3.9%2B-blue)](https://www.python.org/) [![License](https://img.shields.io/badge/license-MIT-blue.svg)](LICENSE) @@ -135,6 +135,55 @@ async with BrightDataClient() as client: - `client.scrape.instagram` - profiles, posts, comments, reels - `client.scrape.facebook` - posts, comments, reels +## Datasets API + +Access 100+ ready-made datasets from Bright Data — pre-collected, structured data from popular platforms. + +```python +async with BrightDataClient() as client: + # Filter a dataset — returns a snapshot_id + snapshot_id = await client.datasets.imdb_movies( + filter={"name": "title", "operator": "includes", "value": "black"}, + records_limit=5 + ) + + # Download when ready (polls until snapshot is complete) + data = await client.datasets.imdb_movies.download(snapshot_id) + print(f"Got {len(data)} records") + + # Quick sample: .sample() auto-discovers fields, no filter needed + # Works on any dataset + snapshot_id = await client.datasets.imdb_movies.sample(records_limit=5) +``` + +**Export results to file:** + +```python +from brightdata.datasets import export + +export(data, "results.json") # JSON +export(data, "results.csv") # CSV +export(data, "results.jsonl") # JSONL +``` + +**Available dataset categories:** +- **E-commerce:** Amazon, Walmart, Shopee, Lazada, Zalando, Zara, H&M, Shein, IKEA, Sephora, and more +- **Business intelligence:** ZoomInfo, PitchBook, Owler, Slintel, VentureRadar, Manta +- **Jobs & HR:** Glassdoor (companies, reviews, jobs), Indeed (companies, jobs), Xing +- **Reviews:** Google Maps, Yelp, G2, Trustpilot, TrustRadius +- **Social media:** Pinterest (posts, profiles), Facebook Pages +- **Real estate:** Zillow, Airbnb, and 8+ regional platforms +- **Luxury brands:** Chanel, Dior, Prada, Balenciaga, Hermes, YSL, and more +- **Entertainment:** IMDB, NBA, Goodreads + +**Discover available fields:** + +```python +metadata = await client.datasets.imdb_movies.get_metadata() +for name, field in metadata.fields.items(): + print(f"{name}: {field.type}") +``` + ## Async Usage Run multiple requests concurrently: diff --git a/notebooks/datasets/mass_test.ipynb b/notebooks/datasets/mass_test.ipynb index a37f8be..013fcfc 100644 --- a/notebooks/datasets/mass_test.ipynb +++ b/notebooks/datasets/mass_test.ipynb @@ -46,7 +46,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [ { @@ -556,20 +556,25 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 16, "metadata": {}, "outputs": [ { - "ename": "TimeoutError", - "evalue": "Snapshot snap_mlowuqjv283bi7hfob not ready after 300s (status: building)", - "output_type": "error", - "traceback": [ - "\u001b[31m---------------------------------------------------------------------------\u001b[39m", - "\u001b[31mTimeoutError\u001b[39m Traceback (most recent call last)", - "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[11]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;66;03m# Companies Enriched - Download\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m companies_enriched_data = \u001b[38;5;28;01mawait\u001b[39;00m client.datasets.companies_enriched.download(companies_enriched_snapshot)\n\u001b[32m 3\u001b[39m \u001b[38;5;28mprint\u001b[39m(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mCompanies Enriched: 
\u001b[39m\u001b[38;5;132;01m{\u001b[39;00m\u001b[38;5;28mlen\u001b[39m(companies_enriched_data)\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m records\u001b[39m\u001b[33m\"\u001b[39m)\n\u001b[32m 4\u001b[39m companies_enriched_data\n", - "\u001b[36mFile \u001b[39m\u001b[32m~/Desktop/projects/sdk-python/src/brightdata/datasets/base.py:185\u001b[39m, in \u001b[36mBaseDataset.download\u001b[39m\u001b[34m(self, snapshot_id, format, timeout, poll_interval)\u001b[39m\n\u001b[32m 183\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m DatasetError(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mSnapshot failed: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstatus.error\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 184\u001b[39m \u001b[38;5;28;01melif\u001b[39;00m time.time() - start_time > timeout:\n\u001b[32m--> \u001b[39m\u001b[32m185\u001b[39m \u001b[38;5;28;01mraise\u001b[39;00m \u001b[38;5;167;01mTimeoutError\u001b[39;00m(\n\u001b[32m 186\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mSnapshot \u001b[39m\u001b[38;5;132;01m{\u001b[39;00msnapshot_id\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m not ready after \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mtimeout\u001b[38;5;132;01m}\u001b[39;00m\u001b[33ms \u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 187\u001b[39m \u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33m(status: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mstatus.status\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m)\u001b[39m\u001b[33m\"\u001b[39m\n\u001b[32m 188\u001b[39m )\n\u001b[32m 190\u001b[39m \u001b[38;5;28;01mawait\u001b[39;00m asyncio.sleep(poll_interval)\n\u001b[32m 192\u001b[39m \u001b[38;5;66;03m# Download data\u001b[39;00m\n", - "\u001b[31mTimeoutError\u001b[39m: Snapshot snap_mlowuqjv283bi7hfob not ready after 300s (status: building)" + "name": "stdout", + "output_type": "stream", + "text": [ + "Companies Enriched: 1 records\n" ] + }, + { + "data": { + "text/plain": [ + "[{'raw': 'Snapshot is building. Try again in a few minutes'}]" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" } ], "source": [