GMB-Scraper/lib/validator.py

202 lines
5 KiB
Python
Raw Permalink Normal View History

"""
Data Validation Module
======================
Validate and clean scraped business data.
"""
import re
from urllib.parse import urlparse
from .logger import get_logger
def validate_phone(phone):
    """
    Validate and normalize Australian phone numbers.

    Every character other than a digit or ``+`` is stripped before matching,
    so spaces, dashes and brackets in the raw value are ignored.

    Args:
        phone: Raw phone value. Non-string values (for example an int) are
            converted with ``str()`` before cleaning instead of raising.

    Returns:
        Normalized phone string (digits only, ``+61`` rewritten to a leading
        ``0``) or None if the value is empty or matches no known pattern.
    """
    if not phone:
        return None

    # Remove all non-digit characters except +
    cleaned = re.sub(r'[^\d+]', '', str(phone))

    # "+61 (0)4xx xxx xxx" keeps a redundant trunk zero once the brackets are
    # stripped; drop it so the number matches the international pattern.
    if re.fullmatch(r'\+610\d{9}', cleaned):
        cleaned = '+61' + cleaned[4:]

    # Australian number patterns (matched against the whole cleaned string)
    patterns = (
        r'\+61\d{9}',       # +61 XXXXXXXXX (international)
        r'0\d{9}',          # 0XXXXXXXXX (landline/mobile)
        r'1[389]00\d{6}',   # 1300/1800/1900 numbers
    )
    if not any(re.fullmatch(pattern, cleaned) for pattern in patterns):
        return None

    # Normalize the international form to the national format
    if cleaned.startswith('+61'):
        return '0' + cleaned[3:]
    return cleaned
def validate_website(website):
    """
    Validate website URL.

    Args:
        website: Raw website string. Empty values and non-string values
            yield None.

    Returns:
        The stripped URL (with ``https://`` prepended when no http/https
        scheme was present) or None if it is invalid or points at a
        Google-owned host.
    """
    if not website or not isinstance(website, str):
        return None

    # Remove whitespace
    website = website.strip()

    # Add https:// if missing. Schemes are case-insensitive, so compare in
    # lower case; otherwise "HTTP://x.com" would get a second scheme glued on.
    if not website.lower().startswith(('http://', 'https://')):
        website = 'https://' + website

    # urlparse raises ValueError for a malformed netloc (e.g. an unbalanced
    # IPv6 bracket); treat that as an invalid URL.
    try:
        parsed = urlparse(website)
    except ValueError:
        return None

    # Host names are case-insensitive, so normalize before the checks below
    host = parsed.netloc.lower()
    if not host or '.' not in host:
        return None

    # Filter out Google domains (common scraping artifact)
    if 'google.com' in host or 'gstatic.com' in host:
        return None

    return website
def validate_rating(rating):
    """
    Coerce a rating to a float on the 0-5 scale.

    Args:
        rating: Rating value (float or string)

    Returns:
        The rating as a float when it converts cleanly and lies within
        0.0 to 5.0 inclusive; otherwise 0.0.
    """
    try:
        value = float(rating)
    except (ValueError, TypeError):
        # Not convertible to a number at all
        return 0.0

    # Out-of-range values (and NaN, which fails every comparison) fall back to 0.0
    return value if 0.0 <= value <= 5.0 else 0.0
def validate_review_count(count):
    """
    Validate review count.

    Args:
        count: Review count (int or string). Strings may contain thousands
            separators (``,``) and surrounding whitespace.

    Returns:
        Non-negative integer count, or 0 if the value cannot be converted.
    """
    try:
        if isinstance(count, str):
            count = count.replace(',', '').strip()
        # Negative counts are clamped to zero
        return max(0, int(count))
    except (ValueError, TypeError, OverflowError):
        # OverflowError covers int(float('inf')), which is neither a
        # ValueError nor a TypeError.
        return 0
def validate_lead(lead):
    """
    Validate and clean a complete lead record.

    Works on a shallow copy of ``lead``; the input dictionary itself is not
    modified. The ``name``, ``phone``, ``website``, ``rating`` and
    ``review_count`` fields are always present in the returned copy.

    Args:
        lead: Dictionary with business data

    Returns:
        Tuple of (validated_lead, is_valid, issues) where ``is_valid`` is a
        bool and ``issues`` is a list of human-readable problem strings.
    """
    logger = get_logger()
    issues = []

    # Create cleaned copy
    cleaned = lead.copy()

    # Validate name (missing, empty or single-character names are blanked)
    if not cleaned.get('name') or len(cleaned['name']) < 2:
        issues.append("Missing or invalid name")
        cleaned['name'] = ""

    # Validate phone; only report an issue when a value was supplied
    original_phone = cleaned.get('phone', '')
    cleaned['phone'] = validate_phone(original_phone)
    if original_phone and not cleaned['phone']:
        issues.append(f"Invalid phone: {original_phone}")

    # Validate website; only report an issue when a value was supplied
    original_website = cleaned.get('website', '')
    cleaned['website'] = validate_website(original_website)
    if original_website and not cleaned['website']:
        issues.append(f"Invalid website: {original_website}")

    # Validate rating
    cleaned['rating'] = validate_rating(cleaned.get('rating', 0))

    # Validate review count
    cleaned['review_count'] = validate_review_count(cleaned.get('review_count', 0))

    # Check for common garbage patterns. This only records an issue; it does
    # not by itself make the lead invalid.
    garbage_names = [
        "closed", "permanently closed", "temporarily closed",
        "out of business", "no longer operating"
    ]
    lowered_name = cleaned['name'].lower()
    if any(garbage in lowered_name for garbage in garbage_names):
        issues.append(f"Business appears closed: {cleaned['name']}")

    # Log issues
    if issues:
        logger.warning(f"Validation issues for '{cleaned.get('name', 'Unknown')}': {', '.join(issues)}")

    # Determine if lead is valid enough to keep: a name plus at least one
    # contact method. Wrapped in bool() so callers receive True/False rather
    # than whichever string or None the and/or chain happened to yield.
    is_valid = bool(
        cleaned['name'] and
        (cleaned['phone'] or cleaned['website'])
    )

    return cleaned, is_valid, issues
def deduplicate_leads(leads, key='maps_url'):
    """
    Drop leads whose ``key`` value has already been encountered.

    The first lead carrying a given value wins and input order is kept.
    Leads with a missing or empty ``key`` value are never treated as
    duplicates and are always kept.

    Args:
        leads: List of lead dictionaries
        key: Field to use for deduplication

    Returns:
        Deduplicated list
    """
    known = set()
    result = []

    for record in leads:
        marker = record.get(key, '')
        if marker:
            if marker in known:
                # Already have a lead with this value; skip the repeat
                continue
            known.add(marker)
        # Either a first occurrence or a lead without the key field
        result.append(record)

    return result