GMB-Scraper/gmb_scraper.py

611 lines
24 KiB
Python
Raw Normal View History

#!/usr/bin/env python3
"""
Google My Business (Maps) Scraper v4 Production Grade
========================================================
Pain-aware lead generation engine for Darwisyah Digital Media.
Extracts business data, detects pain signals, checks website health,
and generates personalized apex pitches.

Usage:
    # Basic scrape (backward compatible)
    python3 gmb_scraper.py -q "lawyers Perth CBD" --min-rating 4.0

    # Pain-aware scrape (recommended)
    python3 gmb_scraper.py -q "dentists Joondalup" --detect-pain --check-websites

    # Filter by pain level
    python3 gmb_scraper.py -q "accountants Perth" --detect-pain --min-pain 20

    # Generate pitch report
    python3 gmb_scraper.py -q "lawyers Perth" --detect-pain --pitch-report --channel sms

    # Full analysis (reviews + websites + pitches)
    python3 gmb_scraper.py -q "dentists Perth" --detect-pain --scrape-reviews --check-websites --pitch-report

Output: CSV with pain scores, signals, and optional pitch drafts.
"""
import argparse
import csv
import json
import os
import sys
import re
import time
import random
import urllib.parse
from pathlib import Path
from datetime import datetime
from dotenv import load_dotenv
from playwright.sync_api import sync_playwright, TimeoutError as PwTimeout
# Load .env file
load_dotenv(Path(__file__).parent / '.env')
# Add lib to path
sys.path.insert(0, str(Path(__file__).parent))
from lib.logger import setup_logger, get_logger, ScraperStats
from lib.retry import retry_with_backoff
from lib.stealth import apply_stealth, create_stealth_context, human_delay, human_scroll_delay
from lib.validator import validate_lead, deduplicate_leads
from lib.pain_detector import detect_pain_signals, calculate_pain_score, format_pain_summary
from lib.review_scraper import scrape_reviews
from lib.health_checker import check_website_health
from lib.pitch_generator import generate_apex_pitch
def parse_args(argv=None):
    """Build the command-line parser and parse the scraper's options.

    Args:
        argv: Optional list of argument strings to parse. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``, which is what
            the script entry point relies on. Passing a list lets the
            parser be driven programmatically.

    Returns:
        argparse.Namespace with the attributes ``query``, ``min_rating``,
        ``min_reviews``, ``max_results``, ``output``, ``json``,
        ``detect_pain``, ``min_pain``, ``scrape_reviews``, ``max_reviews``,
        ``check_websites``, ``pitch_report``, ``channel``, ``headful``,
        ``slow``, ``no_stealth`` and ``proxy``.

    Raises:
        SystemExit: raised by argparse on invalid arguments or ``--help``.
    """
    p = argparse.ArgumentParser(
        description="GMB Scraper v4 — Pain-Aware Lead Generation",
        # Raw formatter keeps the line breaks of the examples epilog intact.
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
%(prog)s -q "lawyers Perth CBD" --detect-pain --check-websites
%(prog)s -q "dentists Joondalup" --detect-pain --scrape-reviews --pitch-report
%(prog)s -q "accountants Perth" --detect-pain --min-pain 25 --channel email
""",
    )

    # Basic options
    p.add_argument("--query", "-q", required=True, help="Search query")
    p.add_argument("--min-rating", type=float, default=0.0, help="Min star rating (default: 0)")
    p.add_argument("--min-reviews", type=int, default=0, help="Min review count (default: 0)")
    p.add_argument("--max-results", type=int, default=100, help="Max results (default: 100)")
    p.add_argument("--output", "-o", default=None, help="Output CSV path")
    p.add_argument("--json", action="store_true", help="Also output JSON")

    # Pain detection
    p.add_argument("--detect-pain", action="store_true", help="Enable pain signal detection")
    p.add_argument("--min-pain", type=int, default=0, help="Min pain score to include (default: 0)")
    p.add_argument("--scrape-reviews", action="store_true", help="Scrape reviews for pain keywords")
    p.add_argument("--max-reviews", type=int, default=30, help="Max reviews per business (default: 30)")
    p.add_argument("--check-websites", action="store_true", help="Check website health (SSL, speed, forms)")

    # Pitch generation
    p.add_argument("--pitch-report", action="store_true", help="Generate pitch report")
    p.add_argument("--channel", default="sms", choices=["sms", "email", "call", "gumtree"],
                   help="Pitch channel (default: sms)")

    # Performance
    p.add_argument("--headful", action="store_true", help="Show browser")
    p.add_argument("--slow", action="store_true", help="Longer delays (safer)")
    p.add_argument("--no-stealth", action="store_true", help="Disable stealth mode (faster)")
    p.add_argument("--proxy", default=None, help="Proxy URL (e.g., http://user:pass@host:port)")

    return p.parse_args(argv)
def _collect_place_links(page, collected):
    """Record place links currently in the DOM that are not yet in *collected*.

    Args:
        page: Playwright page showing the Maps results feed.
        collected: Dict mapping place href -> aria-label; updated in place.

    Returns:
        Number of hrefs added by this call.
    """
    added = 0
    for anchor in page.locator('a[href*="/maps/place/"]').all():
        try:
            href = anchor.get_attribute("href") or ""
            aria = anchor.get_attribute("aria-label") or ""
        except Exception:
            # Best-effort: an anchor whose attributes cannot be read is skipped.
            continue
        if href and href not in collected:
            collected[href] = aria
            added += 1
    return added


@retry_with_backoff(max_attempts=3, base_delay=2.0, retry_on=(PwTimeout, Exception))
def scroll_feed(page, max_results, slow=False):
    """Scroll results feed and collect all place URLs.

    Repeatedly harvests ``/maps/place/`` links and scrolls the results feed
    until ``max_results`` links are known, the feed stops yielding new links
    (checked twice, with an extra wait in between), or 300 scroll rounds
    have been made.

    Args:
        page: Playwright page already showing a Maps search.
        max_results: Maximum number of place links to return.
        slow: When true, every wait between scrolls is doubled. Previously
            this flag was accepted but had no effect.

    Returns:
        Dict mapping place href -> aria-label, in discovery order, holding
        at most ``max_results`` entries.
    """
    logger = get_logger()
    delay_factor = 2.0 if slow else 1.0
    collected = {}

    for attempt in range(1, 301):
        new = _collect_place_links(page, collected)
        if len(collected) >= max_results:
            break

        # Scroll the feed; fall back to the End key if the feed element
        # cannot be scrolled via script.
        try:
            page.locator('[role="feed"]').first.evaluate("el => el.scrollBy(0, 1000)")
        except Exception:
            page.keyboard.press("End")
        time.sleep(human_scroll_delay() * delay_factor)

        # Nothing new before this scroll: wait longer, look once more, and
        # stop if the feed is still not producing links.
        if new == 0:
            time.sleep(2 * delay_factor)
            if _collect_place_links(page, collected) == 0:
                logger.info(f"Scroll complete: {len(collected)} businesses loaded")
                break

        if attempt % 10 == 0:
            logger.info(f"Scroll {attempt}: {len(collected)} businesses loaded...")

    return dict(list(collected.items())[:max_results])
def _strip_label(text, label):
    """Drop a leading *label* (e.g. ``"Phone:"``) from *text* and trim whitespace.

    Only a prefix is removed, so the same words occurring later in the value
    are left untouched.
    """
    if text.startswith(label):
        text = text[len(label):]
    return text.strip()


@retry_with_backoff(max_attempts=2, base_delay=1.5)
def extract_details(page):
    """Extract all details from an open business page.

    Every field is read best-effort: a lookup that fails leaves that field
    at its default instead of aborting the whole extraction.

    Args:
        page: Playwright page currently showing a single Maps place.

    Returns:
        Dict with the keys ``name``, ``address``, ``phone``, ``website``,
        ``rating`` (float, 0.0 if not found), ``review_count`` (int, 0 if
        not found), ``category``, ``hours`` and ``maps_url``.
    """
    data = {
        "name": "",
        "address": "",
        "phone": "",
        "website": "",
        "rating": 0.0,
        "review_count": 0,
        "category": "",
        "hours": "",
        "maps_url": page.url,
    }

    # Visible page text, split into non-empty stripped lines.
    try:
        body = page.locator("body").inner_text(timeout=5000)
        lines = [ln.strip() for ln in body.split("\n") if ln.strip()]
    except Exception:
        lines = []

    # Rating is the first line that looks like "4.5"; the review count
    # "(1,234)" and the category are read from the lines right after it.
    for i, line in enumerate(lines):
        if re.match(r"^\d\.\d$", line):
            data["rating"] = float(line)
            if i + 1 < len(lines):
                rm = re.match(r"^\((\d[\d,]*)\)$", lines[i + 1])
                if rm:
                    data["review_count"] = int(rm.group(1).replace(",", ""))
            if i + 2 < len(lines):
                candidate = lines[i + 2]
                # Skip tab labels and anything too long to be a category.
                if len(candidate) < 60 and candidate not in ("Overview", "Reviews", "About"):
                    data["category"] = candidate
            break

    # Name from h1
    try:
        data["name"] = page.locator("h1").first.inner_text(timeout=3000).strip()
    except Exception:
        pass
    # Fall back to the line above the rating when the heading is missing
    # or empty (previously only a raised error triggered this fallback).
    if not data["name"]:
        for i, line in enumerate(lines):
            if re.match(r"^\d\.\d$", line) and i > 0:
                data["name"] = lines[i - 1]
                break

    # Address
    try:
        addr_btn = page.locator('button[data-item-id="address"]').first
        if addr_btn.count() > 0:
            aria = addr_btn.get_attribute("aria-label") or ""
            data["address"] = _strip_label(aria, "Address:")
    except Exception:
        pass

    # Phone
    try:
        for btn in page.locator('button[data-item-id^="phone"]').all():
            aria = btn.get_attribute("aria-label") or ""
            if aria.startswith("Phone:"):
                data["phone"] = _strip_label(aria, "Phone:")
                break
    except Exception:
        pass

    # Website: prefer the labelled link, otherwise take the first external
    # http(s) link that does not point at a Google-owned host.
    try:
        website_links = page.locator('a[aria-label^="Website:"]').all()
        if website_links:
            aria = website_links[0].get_attribute("aria-label") or ""
            data["website"] = _strip_label(aria, "Website:")
        else:
            for link in page.locator("a").all():
                try:
                    href = link.get_attribute("href") or ""
                except Exception:
                    continue
                if (href.startswith("http") and
                        "google.com" not in href and
                        "gstatic.com" not in href and
                        "ggpht.com" not in href and
                        "tel:" not in href and
                        len(href) > 10):
                    data["website"] = href
                    break
    except Exception:
        pass

    # Hours
    try:
        hours_btn = page.locator('button[data-item-id="oh"]').first
        if hours_btn.count() > 0:
            aria = hours_btn.get_attribute("aria-label") or ""
            data["hours"] = aria.strip()
    except Exception:
        pass

    return data
# Directory used for default output files and for the per-query "latest" copy.
_CACHE_DIR = "/root/.hermes/cache/gmb"


def _flag(value):
    """Render a truthy/falsy value as a short yes/no marker for console output."""
    return "yes" if value else "no"


def _query_slug(query):
    """Turn a search query into a filesystem-safe name fragment (max 40 chars)."""
    return re.sub(r"[^\w]", "_", query)[:40]


def _json_path_for(csv_path):
    """Return the JSON companion path for *csv_path*, never the CSV path itself."""
    base, ext = os.path.splitext(csv_path)
    if ext.lower() == ".csv":
        return base + ".json"
    # No .csv suffix to swap: append instead, so the CSV is not overwritten.
    return csv_path + ".json"


def _pain_score(lead):
    """Return a lead's pain score, or 0 when no pain data is attached."""
    return (lead.get('pain_data') or {}).get('pain_score', 0)


def _lead_to_row(lead, args):
    """Flatten one lead plus its pain/health/pitch data into a CSV row dict."""
    row = lead.copy()
    pain_data = lead.get('pain_data')
    if args.detect_pain and pain_data:
        row['pain_score'] = pain_data['pain_score']
        row['pain_signals'] = '; '.join(pain_data['signals'])
        row['primary_service'] = pain_data.get('primary_service', '')
        row['confidence'] = pain_data.get('confidence', '')
    health = lead.get('health_check')
    if args.check_websites and health:
        row['website_reachable'] = health.get('reachable', False)
        row['website_ssl'] = health.get('ssl_valid', False)
        row['website_load_time'] = health.get('load_time', 0)
        row['website_mobile'] = health.get('mobile_friendly', False)
        row['website_form'] = health.get('has_contact_form', False)
    pitch = lead.get('pitch')
    if args.pitch_report and pitch:
        row['pitch'] = pitch['pitch']
    return row


def _write_csv(path, fields, results, args):
    """Write all leads to *path* as UTF-8 CSV, creating the parent directory."""
    Path(path).parent.mkdir(parents=True, exist_ok=True)
    with open(path, "w", newline="", encoding="utf-8") as f:
        # Leads carry extra keys (reviews, pain_data, ...) that are not columns.
        writer = csv.DictWriter(f, fieldnames=fields, extrasaction='ignore')
        writer.writeheader()
        for lead in results:
            writer.writerow(_lead_to_row(lead, args))


def _scrape_lead(page, href, args, stats, logger):
    """Visit one place URL and return its lead dict, or None if it was filtered out."""
    page.goto(href, wait_until="domcontentloaded", timeout=15000)
    time.sleep(human_delay(1.0, 2.0))
    try:
        page.wait_for_selector("h1", timeout=5000)
    except PwTimeout:
        pass  # Proceed anyway; extraction is best-effort.

    # Extract basic details
    data = extract_details(page)

    # Apply basic filters (a rating of 0 means "unknown" and is not filtered)
    if 0 < data["rating"] < args.min_rating:
        print(f" ⏭️ {data['rating']}★ < {args.min_rating}")
        stats.increment('businesses_filtered')
        return None
    if data["review_count"] < args.min_reviews:
        print(f" ⏭️ {data['review_count']} reviews < {args.min_reviews}")
        stats.increment('businesses_filtered')
        return None

    # Validate lead
    data, is_valid, _issues = validate_lead(data)
    if not is_valid:
        print(f" ⏭️ Invalid lead")
        stats.increment('businesses_filtered')
        return None
    stats.increment('businesses_scraped')

    # === PAIN DETECTION ===
    pain_data = None
    reviews = []
    if args.detect_pain:
        print(" 🔍", end="")
        # Scrape reviews if requested; a failure here must not lose the lead.
        if args.scrape_reviews:
            print(" 📝", end="")
            try:
                reviews = scrape_reviews(page, max_reviews=args.max_reviews)
                stats.increment('reviews_scraped', len(reviews))
            except Exception as e:
                logger.warning(f"Review scrape failed for {data['name']}: {e}")
        # Website health is checked later, so no health data is available yet.
        pain_data = detect_pain_signals(data, reviews=reviews, health_check=None)
        stats.increment('pain_signals_detected', pain_data['signal_count'])
        # Filter by pain score
        if pain_data['pain_score'] < args.min_pain:
            print(f" ⏭️ pain={pain_data['pain_score']} < {args.min_pain}")
            stats.increment('businesses_filtered')
            return None

    # Store data
    data['reviews'] = reviews if args.scrape_reviews else []
    data['pain_data'] = pain_data

    # Print status
    if pain_data:
        print(f"{data['rating']}★ ({data['review_count']}r) pain={pain_data['pain_score']}", end="")
    else:
        web_flag = "🌐" if data["website"] else ""
        print(f"{data['rating']}★ ({data['review_count']}r) {web_flag}", end="")
    return data


def _run_health_checks(results, stats):
    """Check each lead's website and recompute its pain data with the result."""
    total = len(results)
    for i, lead in enumerate(results, 1):
        label = f" [{i}/{total}] {lead['name'][:40]:<40}"
        if not lead.get('website'):
            print(f"{label} → No website")
            continue
        print(label, end="")
        try:
            health = check_website_health(lead['website'])
            stats.increment('websites_checked')
            lead['health_check'] = health
            # Re-detect pain with health data
            pain_data = detect_pain_signals(
                lead,
                reviews=lead.get('reviews', []),
                health_check=health,
            )
            lead['pain_data'] = pain_data
            print(f" reachable={_flag(health.get('reachable'))} "
                  f"ssl={_flag(health.get('ssl_valid'))} "
                  f"load={health.get('load_time', 0):.1f}s "
                  f"pain={pain_data['pain_score']}")
        except Exception as e:
            print(f" failed: {str(e)[:50]}")
            lead['health_check'] = None


def _generate_pitches(results, channel, stats):
    """Attach a generated pitch to every lead that has a positive pain score."""
    total = len(results)
    for i, lead in enumerate(results, 1):
        pain_data = lead.get('pain_data')
        if pain_data and pain_data.get('pain_score', 0) > 0:
            pitch = generate_apex_pitch(lead, pain_data, channel=channel)
            lead['pitch'] = pitch
            stats.increment('pitches_generated')
            print(f" [{i}/{total}] {lead['name'][:40]:<40} → {pitch['primary_service']}")


def _print_summary(results, args):
    """Print the top-30 table and aggregate statistics for the kept leads."""
    if args.detect_pain:
        print(f"\n{'#':<3} {'NAME':<30} {'RATING':<6} {'PAIN':<5} {'SERVICE':<25} {'CONF':<5}")
        print("-" * 80)
        ranked = sorted(results, key=_pain_score, reverse=True)
        for i, r in enumerate(ranked[:30], 1):
            name = (r["name"][:28] + "..") if len(r["name"]) > 30 else r["name"]
            pain_data = r.get('pain_data') or {}
            pain = pain_data.get('pain_score', 0)
            service = (pain_data.get('primary_service', '') or '')[:23]
            conf = pain_data.get('confidence', '')
            print(f"{i:<3} {name:<30} {r['rating']:<6} {pain:<5} {service:<25} {conf:<5}")
    else:
        print(f"\n{'#':<3} {'NAME':<35} {'RATING':<6} {'REV':<5} {'PHONE':<16} {'WEB':<4}")
        print("-" * 75)
        for i, r in enumerate(results[:30], 1):
            name = (r["name"][:33] + "..") if len(r["name"]) > 35 else r["name"]
            phone = (r["phone"][:14]) if r["phone"] else ""
            web = _flag(r["website"])
            print(f"{i:<3} {name:<35} {r['rating']:<6} {r['review_count']:<5} {phone:<16} {web}")
    if len(results) > 30:
        print(f"\n ... +{len(results)-30} more in CSV")

    # Stats (unrated businesses, rating 0, are left out of the average)
    with_web = sum(1 for r in results if r["website"])
    with_phone = sum(1 for r in results if r["phone"])
    rated = [r["rating"] for r in results if r["rating"] > 0]
    avg_r = sum(rated) / max(len(rated), 1)
    print(f"\n📈 Stats:")
    print(f" Total: {len(results)} businesses")
    print(f" Avg rating: {avg_r:.1f}")
    print(f" With website: {with_web}")
    print(f" With phone: {with_phone}")
    if args.detect_pain:
        high_pain = sum(1 for r in results if _pain_score(r) >= 30)
        med_pain = sum(1 for r in results if 15 <= _pain_score(r) < 30)
        low_pain = sum(1 for r in results if 0 < _pain_score(r) < 15)
        print(f" High pain: {high_pain} (score ≥30)")
        print(f" Medium pain: {med_pain} (score 15-29)")
        print(f" Low pain: {low_pain} (score 1-14)")


def main():
    """Run the scraper end to end: search, scrape, analyse, and write reports.

    Steps: parse CLI options, open Google Maps for the query, collect place
    links, visit each place and filter it, optionally run website health
    checks and pitch generation, then write the CSV (and optional JSON),
    refresh the per-query "latest" CSV and print a summary.

    Returns:
        None. Returns early when the search yields no places or when no
        lead survives the filters.
    """
    args = parse_args()

    # Setup logging
    logger = setup_logger('gmb_scraper')
    stats = ScraperStats(logger)

    # Setup output path
    slug = _query_slug(args.query)
    if not args.output:
        ts = datetime.now().strftime("%Y%m%d_%H%M%S")
        args.output = f"{_CACHE_DIR}/{slug}_{ts}.csv"
    Path(args.output).parent.mkdir(parents=True, exist_ok=True)

    # These options are only acted on inside the pain-detection path.
    if not args.detect_pain and (args.scrape_reviews or args.check_websites or args.pitch_report):
        logger.warning("--scrape-reviews, --check-websites and --pitch-report "
                       "have no effect without --detect-pain")
    # The plain (non-stealth) context below is created without a proxy.
    if args.no_stealth and args.proxy:
        logger.warning("--proxy is ignored when --no-stealth is set")

    # Banner
    print("=" * 80)
    print(" 🗺️ GMB Scraper v4 — Pain-Aware Lead Generation")
    print("=" * 80)
    print(f" Query: {args.query}")
    print(f" Max results: {args.max_results}")
    print(f" Min rating: {args.min_rating}")
    print(f" Min reviews: {args.min_reviews}")
    print(f" Pain detection: {_flag(args.detect_pain)}")
    print(f" Review scraping: {_flag(args.scrape_reviews)}")
    print(f" Website checks: {_flag(args.check_websites)}")
    print(f" Pitch report: {_flag(args.pitch_report)}")
    print(f" Output: {args.output}")
    print("=" * 80)

    results = []
    encoded = urllib.parse.quote_plus(args.query)
    url = f"https://www.google.com/maps/search/{encoded}"

    with sync_playwright() as pw:
        browser = pw.chromium.launch(
            headless=not args.headful,
            args=["--disable-blink-features=AutomationControlled", "--no-sandbox", "--disable-dev-shm-usage"],
        )
        # try/finally so the browser is closed on every exit path,
        # including errors raised while navigating or scrolling.
        try:
            # Create context
            if args.no_stealth:
                context = browser.new_context(
                    viewport={"width": 1920, "height": 1080},
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36",
                    locale="en-AU",
                    timezone_id="Australia/Perth",
                )
            else:
                context = create_stealth_context(browser, headless=not args.headful, proxy=args.proxy)
            page = context.new_page()

            # Apply stealth to page
            if not args.no_stealth:
                apply_stealth(context, page)

            # Block images/fonts for speed
            page.route("**/*.{png,jpg,jpeg,gif,svg,webp,ico,woff,woff2}", lambda r: r.abort())

            # Navigate
            logger.info(f"Searching: \"{args.query}\"")
            page.goto(url, wait_until="domcontentloaded", timeout=30000)
            time.sleep(human_delay(3, 5))

            # Wait for feed
            try:
                page.wait_for_selector('[role="feed"]', timeout=15000)
            except PwTimeout:
                logger.warning("Feed slow, waiting extra...")
                time.sleep(5)

            # Scroll and collect
            logger.info(f"Scrolling for results (max {args.max_results})...")
            hrefs = scroll_feed(page, args.max_results, args.slow)
            logger.info(f"Found {len(hrefs)} businesses")
            stats.increment('businesses_found', len(hrefs))
            if not hrefs:
                logger.error("No results found. Try a broader query.")
                return

            # Visit each business page
            total = len(hrefs)
            for i, (href, aria_name) in enumerate(hrefs.items(), 1):
                print(f"\n[{i}/{total}] {aria_name[:45]:<45}", end="")
                try:
                    lead = _scrape_lead(page, href, args, stats, logger)
                except Exception as e:
                    # One broken listing must not abort the whole run.
                    logger.error(f"Error scraping {aria_name}: {e}")
                    stats.increment('errors')
                    lead = None
                if lead is not None:
                    results.append(lead)

                # Anti-detection pause every 10 visited items, whether or
                # not the 10th one was kept.
                if i % 10 == 0:
                    pause = random.uniform(3, 6)
                    logger.debug(f"Anti-detection pause: {pause:.1f}s")
                    time.sleep(pause)
        finally:
            browser.close()

    # === POST-PROCESSING: Website Health Checks ===
    if args.check_websites and args.detect_pain:
        print(f"\n{'='*80}")
        print(f"🔍 Checking website health for {len(results)} businesses...")
        print(f"{'='*80}")
        _run_health_checks(results, stats)

    # === POST-PROCESSING: Pitch Generation ===
    if args.pitch_report and args.detect_pain:
        print(f"\n{'='*80}")
        print(f"📝 Generating apex pitches ({args.channel})...")
        print(f"{'='*80}")
        _generate_pitches(results, args.channel, stats)

    # === OUTPUT ===
    print(f"\n{'='*80}")
    print(f"📊 RESULTS: {len(results)} businesses (filtered from {total})")
    print(f"{'='*80}")
    if not results:
        logger.error("No results matched filters. Try lowering thresholds.")
        stats.log_summary()
        return

    # Build CSV fields
    fields = ["name", "address", "phone", "website", "rating", "review_count", "category", "hours", "maps_url"]
    if args.detect_pain:
        fields.extend(["pain_score", "pain_signals", "primary_service", "confidence"])
    if args.check_websites:
        fields.extend(["website_reachable", "website_ssl", "website_load_time", "website_mobile", "website_form"])
    if args.pitch_report:
        fields.append("pitch")

    # Write CSV
    _write_csv(args.output, fields, results, args)
    print(f"\n💾 CSV: {args.output}")

    # Write JSON
    if args.json:
        jp = _json_path_for(args.output)
        # ensure_ascii=False emits raw non-ASCII text, so pin the encoding.
        with open(jp, "w", encoding="utf-8") as f:
            json.dump(results, f, indent=2, ensure_ascii=False, default=str)
        print(f"💾 JSON: {jp}")

    # Refresh the per-query "latest" copy (a full copy, not a symlink).
    # Best-effort: the main CSV is already on disk, so a failure here only warns.
    latest = f"{_CACHE_DIR}/{slug}_latest.csv"
    try:
        _write_csv(latest, fields, results, args)
    except OSError as e:
        logger.warning(f"Could not write latest copy {latest}: {e}")

    # Print summary table and stats
    _print_summary(results, args)

    # Log final stats
    stats.log_summary()
    print(f"\n🎯 Done! Results saved to {args.output}")


if __name__ == "__main__":
    main()