- Stealth mode: playwright-stealth, random fingerprints, human delays - Retry logic: exponential backoff (3 attempts) - Logging: rotating logs to /root/.hermes/logs/gmb/ - Validation: phone/website/rating validation + dedup - Pain detection: 12 signals, scoring, service matching - Review scraper: extract reviews + pain keyword detection - Website health: SSL, speed, mobile, contact form checks - Pitch generator: Apex pitches (SMS, email, call, Gumtree) - Docker containerization - .env for secrets (no hardcoded API keys) - Integration with Pipecat voice dialer (gmb_to_voice.py)
201 lines
5 KiB
Python
201 lines
5 KiB
Python
"""
|
|
Data Validation Module
|
|
======================
|
|
Validate and clean scraped business data.
|
|
"""
|
|
|
|
import re
|
|
from urllib.parse import urlparse
|
|
from .logger import get_logger
|
|
|
|
|
|
def validate_phone(phone):
    """
    Validate and normalize Australian phone numbers.

    Formatting characters (spaces, dashes, brackets, dots) are stripped
    before matching. Accepted shapes are ``+61XXXXXXXXX``, ``0XXXXXXXXX``
    and ``1300``/``1800``/``1900`` numbers. The common ``+61 (0)4...``
    spelling, where the national trunk zero is kept after the country
    code, is accepted as well.

    Args:
        phone: Raw phone value. Non-string values are converted with str().

    Returns:
        Normalized phone string (national format, leading 0 or 1x00)
        or None if invalid
    """
    if not phone:
        return None

    # Scraped values are not guaranteed to be strings (e.g. an int coming
    # out of JSON), so coerce before handing the value to the regex engine.
    # Remove all non-digit characters except +
    cleaned = re.sub(r'[^\d+]', '', str(phone))

    # "+61 (0)412 345 678" collapses to "+610412345678"; drop the redundant
    # trunk zero so it is treated like any other international number.
    if re.fullmatch(r'\+610\d{9}', cleaned):
        cleaned = '+61' + cleaned[4:]

    # Australian number patterns
    patterns = (
        r'\+61\d{9}',      # +61 XXXXXXXXX (international)
        r'0\d{9}',         # 0XXXXXXXXX (landline/mobile)
        r'1[389]00\d{6}',  # 1300/1800/1900 numbers
    )
    if not any(re.fullmatch(pattern, cleaned) for pattern in patterns):
        return None

    # Normalize to Australian national format
    if cleaned.startswith('+61'):
        return '0' + cleaned[3:]
    return cleaned
|
|
|
|
|
|
def validate_website(website):
    """
    Validate website URL.

    The value is trimmed, given an ``https://`` scheme when it has none,
    and then checked for a host that contains a dot. URLs on Google
    domains are rejected because they are a common scraping artifact.

    Args:
        website: Raw website string

    Returns:
        Cleaned URL or None if invalid (including non-string input)
    """
    if not website or not isinstance(website, str):
        return None

    # Remove whitespace
    website = website.strip()

    # Add https:// if missing. URL schemes are case-insensitive, so compare
    # in lower case; otherwise "HTTPS://x.com" would get a second scheme.
    if not website.lower().startswith(('http://', 'https://')):
        website = 'https://' + website

    # Validate URL structure
    try:
        parsed = urlparse(website)
    except ValueError:
        # urlparse raises ValueError for malformed hosts such as an
        # unbalanced IPv6 bracket.
        return None

    # Host names are case-insensitive as well
    host = parsed.netloc.lower()
    if not host or '.' not in host:
        return None

    # Filter out Google domains (common scraping artifact)
    if 'google.com' in host or 'gstatic.com' in host:
        return None

    return website
|
|
|
|
|
|
def validate_rating(rating):
    """
    Coerce a rating to a float on the 0-5 scale.

    Args:
        rating: Rating value (float or string)

    Returns:
        The rating as a float when it converts cleanly and lies within
        0.0-5.0 inclusive; 0.0 otherwise
    """
    try:
        value = float(rating)
    except (ValueError, TypeError):
        # Not convertible to a number at all
        return 0.0

    # Out-of-range values (and NaN, which fails both comparisons) fall back to 0.0
    return value if 0.0 <= value <= 5.0 else 0.0
|
|
|
|
|
|
def validate_review_count(count):
    """
    Coerce a review count to a non-negative integer.

    Args:
        count: Review count (int or string); strings may contain
            thousands separators such as "1,234"

    Returns:
        Integer count, clamped to a minimum of 0, or 0 if the value
        cannot be converted
    """
    # Strip thousands separators and padding from textual counts
    raw = count.replace(',', '').strip() if isinstance(count, str) else count

    try:
        parsed = int(raw)
    except (ValueError, TypeError):
        return 0

    # Negative counts make no sense; clamp them to zero
    return parsed if parsed > 0 else 0
|
|
|
|
|
|
def validate_lead(lead):
    """
    Validate and clean a complete lead record.

    Works on a shallow copy of ``lead``; the input dictionary is not
    modified. The ``name``, ``phone``, ``website``, ``rating`` and
    ``review_count`` fields are replaced by their validated values.

    Args:
        lead: Dictionary with business data

    Returns:
        Tuple of (validated_lead, is_valid, issues) where ``is_valid`` is
        a bool that is True when the lead has a usable name and at least
        one contact method, and ``issues`` is a list of human-readable
        problem descriptions
    """
    logger = get_logger()
    issues = []

    # Create cleaned copy
    cleaned = lead.copy()

    # Validate name. Anything that is not a string is treated as missing,
    # and surrounding whitespace does not count towards the minimum length.
    name = cleaned.get('name')
    name = name.strip() if isinstance(name, str) else ""
    if len(name) < 2:
        issues.append("Missing or invalid name")
        name = ""
    cleaned['name'] = name

    # Validate phone
    original_phone = cleaned.get('phone', '')
    cleaned['phone'] = validate_phone(original_phone)
    if original_phone and not cleaned['phone']:
        issues.append(f"Invalid phone: {original_phone}")

    # Validate website
    original_website = cleaned.get('website', '')
    cleaned['website'] = validate_website(original_website)
    if original_website and not cleaned['website']:
        issues.append(f"Invalid website: {original_website}")

    # Validate rating
    cleaned['rating'] = validate_rating(cleaned.get('rating', 0))

    # Validate review count
    cleaned['review_count'] = validate_review_count(cleaned.get('review_count', 0))

    # Check for common garbage patterns. This only records an issue; it
    # does not by itself make the lead invalid.
    garbage_names = [
        "closed", "permanently closed", "temporarily closed",
        "out of business", "no longer operating"
    ]
    lowered_name = name.lower()
    if any(garbage in lowered_name for garbage in garbage_names):
        issues.append(f"Business appears closed: {name}")

    # Log issues. The name key is always present at this point, so fall
    # back to 'Unknown' on an empty value rather than on a missing key.
    if issues:
        logger.warning(f"Validation issues for '{name or 'Unknown'}': {', '.join(issues)}")

    # Determine if lead is valid enough to keep. bool() is needed because
    # and/or return one of their operands (a string or None), not True/False.
    is_valid = bool(
        name and
        (cleaned['phone'] or cleaned['website'])  # Need at least one contact method
    )

    return cleaned, is_valid, issues
|
|
|
|
|
|
def deduplicate_leads(leads, key='maps_url'):
    """
    Drop leads whose ``key`` value has already been seen.

    The first lead carrying a given value wins and input order is kept.
    Leads where the field is missing or empty cannot be compared, so
    they are always retained.

    Args:
        leads: List of lead dictionaries
        key: Field to use for deduplication

    Returns:
        Deduplicated list
    """
    already_seen = set()
    kept = []

    for record in leads:
        marker = record.get(key, '')
        if marker:
            if marker in already_seen:
                # Later duplicate of a lead we already kept
                continue
            already_seen.add(marker)
        kept.append(record)

    return kept
|