""" Data Validation Module ====================== Validate and clean scraped business data. """ import re from urllib.parse import urlparse from .logger import get_logger def validate_phone(phone): """ Validate and normalize Australian phone numbers. Args: phone: Raw phone string Returns: Normalized phone string or None if invalid """ if not phone: return None # Remove all non-digit characters except + cleaned = re.sub(r'[^\d+]', '', phone) # Australian number patterns patterns = [ r'^\+61\d{9}$', # +61 XXXXXXXXX (international) r'^0\d{9}$', # 0XXXXXXXXX (landline/mobile) r'^1[389]00\d{6}$', # 1300/1800/1900 numbers ] for pattern in patterns: if re.match(pattern, cleaned): # Normalize to Australian format if cleaned.startswith('+61'): return '0' + cleaned[3:] return cleaned return None def validate_website(website): """ Validate website URL. Args: website: Raw website string Returns: Cleaned URL or None if invalid """ if not website: return None # Remove whitespace website = website.strip() # Add https:// if missing if not website.startswith(('http://', 'https://')): website = 'https://' + website # Validate URL structure try: parsed = urlparse(website) if not parsed.netloc or '.' not in parsed.netloc: return None # Filter out Google domains (common scraping artifact) if 'google.com' in parsed.netloc or 'gstatic.com' in parsed.netloc: return None return website except Exception: return None def validate_rating(rating): """ Validate rating value. Args: rating: Rating value (float or string) Returns: Float rating or 0.0 if invalid """ try: rating = float(rating) if 0.0 <= rating <= 5.0: return rating except (ValueError, TypeError): pass return 0.0 def validate_review_count(count): """ Validate review count. Args: count: Review count (int or string) Returns: Integer count or 0 if invalid """ try: if isinstance(count, str): count = count.replace(',', '').strip() count = int(count) return max(0, count) except (ValueError, TypeError): return 0 def validate_lead(lead): """ Validate and clean a complete lead record. Args: lead: Dictionary with business data Returns: Tuple of (validated_lead, is_valid, issues) """ logger = get_logger() issues = [] # Create cleaned copy cleaned = lead.copy() # Validate name if not cleaned.get('name') or len(cleaned['name']) < 2: issues.append("Missing or invalid name") cleaned['name'] = "" # Validate phone original_phone = cleaned.get('phone', '') cleaned['phone'] = validate_phone(original_phone) if original_phone and not cleaned['phone']: issues.append(f"Invalid phone: {original_phone}") # Validate website original_website = cleaned.get('website', '') cleaned['website'] = validate_website(original_website) if original_website and not cleaned['website']: issues.append(f"Invalid website: {original_website}") # Validate rating cleaned['rating'] = validate_rating(cleaned.get('rating', 0)) # Validate review count cleaned['review_count'] = validate_review_count(cleaned.get('review_count', 0)) # Check for common garbage patterns garbage_names = [ "closed", "permanently closed", "temporarily closed", "out of business", "no longer operating" ] if any(garbage in cleaned['name'].lower() for garbage in garbage_names): issues.append(f"Business appears closed: {cleaned['name']}") # Log issues if issues: logger.warning(f"Validation issues for '{cleaned.get('name', 'Unknown')}': {', '.join(issues)}") # Determine if lead is valid enough to keep is_valid = ( cleaned['name'] and (cleaned['phone'] or cleaned['website']) # Need at least one contact method ) return cleaned, is_valid, issues def deduplicate_leads(leads, key='maps_url'): """ Remove duplicate leads based on a key field. Args: leads: List of lead dictionaries key: Field to use for deduplication Returns: Deduplicated list """ seen = set() unique_leads = [] for lead in leads: identifier = lead.get(key, '') if identifier and identifier not in seen: seen.add(identifier) unique_leads.append(lead) elif not identifier: # Keep leads without the key field unique_leads.append(lead) return unique_leads