- Stealth mode: playwright-stealth, random fingerprints, human delays - Retry logic: exponential backoff (3 attempts) - Logging: rotating logs to /root/.hermes/logs/gmb/ - Validation: phone/website/rating validation + dedup - Pain detection: 12 signals, scoring, service matching - Review scraper: extract reviews + pain keyword detection - Website health: SSL, speed, mobile, contact form checks - Pitch generator: Apex pitches (SMS, email, call, Gumtree) - Docker containerization - .env for secrets (no hardcoded API keys) - Integration with Pipecat voice dialer (gmb_to_voice.py)
258 lines
8.3 KiB
Python
258 lines
8.3 KiB
Python
"""
|
|
Website Health Checker Module
|
|
==============================
|
|
Check website health: SSL, speed, mobile-friendly, forms.
|
|
"""
|
|
|
|
import ssl
|
|
import re
|
|
import socket
|
|
import time
|
|
import requests
|
|
from urllib.parse import urlparse
|
|
from bs4 import BeautifulSoup
|
|
from .logger import get_logger
|
|
from .retry import retry_with_backoff
|
|
|
|
|
|
@retry_with_backoff(max_attempts=2, base_delay=1.0)
def check_website_health(website_url, timeout=10):
    """
    Comprehensive website health check.

    Probes the site's TLS certificate, fetches the page once with a mobile
    User-Agent, and inspects the returned HTML for basic signals: title,
    meta description, viewport tag, forms, phone number and email.

    Network and parsing failures are caught and recorded in
    ``result['issues']``; they are not raised to the caller.
    NOTE(review): because nothing propagates out of this function, the
    ``retry_with_backoff`` decorator presumably never sees a failure to
    retry -- confirm against the retry module.

    Args:
        website_url: Website URL to check. A value without a scheme
            (e.g. ``"example.com.au"``) is checked as ``https://...``.
        timeout: Request timeout in seconds

    Returns:
        Dictionary with health check results, or None when ``website_url``
        is empty.
    """
    logger = get_logger()

    if not website_url:
        return None

    result = {
        'url': website_url,
        'reachable': False,
        'ssl_valid': False,
        'ssl_expiry': None,
        'load_time': 0,
        'status_code': 0,
        'mobile_friendly': False,
        'has_contact_form': False,
        'has_phone_number': False,
        'has_email': False,
        'title': '',
        'meta_description': '',
        'issues': [],
    }

    try:
        # Business listings often store bare domains; without a scheme
        # urlparse() finds no hostname and requests refuses the URL.
        fetch_url = website_url.strip()
        if '://' not in fetch_url:
            fetch_url = f"https://{fetch_url.lstrip('/')}"

        parsed = urlparse(fetch_url)
        hostname = parsed.hostname

        if not hostname:
            result['issues'].append("Invalid URL")
            return result

        # Probe the port the site actually serves HTTPS on. Plain-http URLs
        # are still probed on 443 so a missing certificate gets reported.
        ssl_port = 443
        if parsed.scheme == 'https' and parsed.port:
            ssl_port = parsed.port

        ssl_result = check_ssl(hostname, ssl_port)
        result['ssl_valid'] = ssl_result['valid']
        result['ssl_expiry'] = ssl_result['expiry']
        if not ssl_result['valid']:
            # Never surface a literal "None" in the issue text.
            ssl_error = ssl_result['error'] or "certificate could not be validated"
            result['issues'].append(f"SSL issue: {ssl_error}")

        # Check reachability and load time (monotonic clock, so wall-clock
        # adjustments cannot distort the measurement).
        start_time = time.perf_counter()
        response = requests.get(
            fetch_url,
            timeout=timeout,
            headers={
                'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 17_0 like Mac OS X) AppleWebKit/605.1.15'
            },
            allow_redirects=True
        )
        load_time = time.perf_counter() - start_time

        result['reachable'] = response.status_code == 200
        result['status_code'] = response.status_code
        result['load_time'] = round(load_time, 2)

        if response.status_code != 200:
            result['issues'].append(f"HTTP {response.status_code}")

        # The body is analysed even for non-200 responses.
        page_fields, page_issues = _analyze_page_html(response.text)
        result.update(page_fields)
        result['issues'].extend(page_issues)

        if load_time > 3.0:
            result['issues'].append(f"Slow load time: {load_time:.1f}s")

        logger.info(
            f"Website health for {hostname}: "
            f"reachable={result['reachable']}, "
            f"ssl={result['ssl_valid']}, "
            f"load={load_time:.1f}s, "
            f"mobile={result['mobile_friendly']}, "
            f"form={result['has_contact_form']}, "
            f"issues={len(result['issues'])}"
        )

        return result

    except requests.Timeout:
        result['issues'].append("Request timeout")
        logger.warning(f"Website timeout: {website_url}")
        return result
    except requests.ConnectionError:
        result['issues'].append("Connection failed")
        logger.warning(f"Website unreachable: {website_url}")
        return result
    except Exception as e:
        # Deliberate best-effort boundary: one bad site must not abort a batch.
        result['issues'].append(f"Error: {str(e)[:100]}")
        logger.error(f"Website check error for {website_url}: {e}")
        return result


def _analyze_page_html(html):
    """
    Extract on-page signals from an HTML document.

    Args:
        html: Page markup as text

    Returns:
        Tuple ``(fields, issues)``: ``fields`` maps result keys (title,
        meta_description, mobile_friendly, has_contact_form,
        has_phone_number, has_email) to their values; ``issues`` lists the
        problems found, in the order the checks run.
    """
    soup = BeautifulSoup(html, 'lxml')
    fields = {}
    issues = []

    title_tag = soup.find('title')
    fields['title'] = title_tag.get_text().strip() if title_tag else ''
    if not fields['title']:
        issues.append("Missing page title")

    meta_desc = soup.find('meta', attrs={'name': 'description'})
    fields['meta_description'] = meta_desc.get('content', '') if meta_desc else ''
    if not fields['meta_description']:
        issues.append("Missing meta description")

    # A viewport meta tag that sets a width is used as the mobile signal.
    viewport = soup.find('meta', attrs={'name': 'viewport'})
    fields['mobile_friendly'] = (
        viewport is not None and 'width' in viewport.get('content', '')
    )
    if not fields['mobile_friendly']:
        issues.append("Not mobile-friendly (no viewport meta)")

    # Any <form> counts. Matching contact keywords on action/id/class would
    # not change the outcome, because forms without them are accepted too.
    fields['has_contact_form'] = soup.find('form') is not None
    if not fields['has_contact_form']:
        issues.append("No contact form found")

    page_text = soup.get_text()

    # Visible text misses numbers that exist only as click-to-call links,
    # so tel: hrefs are accepted as well.
    phone_pattern = r'(\+61|0[2-8])\s*\d[\d\s-]{7,9}'
    tel_link = soup.find('a', href=re.compile(r'^\s*tel:', re.IGNORECASE))
    fields['has_phone_number'] = bool(
        re.search(phone_pattern, page_text) or tel_link
    )
    if not fields['has_phone_number']:
        issues.append("No phone number found")

    # Same reasoning for addresses that appear only in mailto: links.
    email_pattern = r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'
    mailto_link = soup.find('a', href=re.compile(r'^\s*mailto:', re.IGNORECASE))
    fields['has_email'] = bool(
        re.search(email_pattern, page_text) or mailto_link
    )

    return fields, issues
|
|
|
|
|
|
def check_ssl(hostname, port=443):
    """
    Check SSL certificate validity and expiry.

    Opens a TLS connection with the default SSL context and reads the peer
    certificate's ``notAfter`` field. Connection, handshake and parsing
    failures are reported through ``error``; nothing is raised.

    Args:
        hostname: Domain name
        port: SSL port

    Returns:
        Dictionary with SSL info:
            valid: True only when the certificate has more than 7 full
                days left before it expires.
            expiry: Certificate expiry as a naive ISO-8601 string in UTC,
                or None when it could not be read.
            error: Short description of the problem, or None.
    """
    from datetime import datetime, timezone

    result = {
        'valid': False,
        'expiry': None,
        'error': None,
    }

    try:
        context = ssl.create_default_context()
        with socket.create_connection((hostname, port), timeout=5) as sock:
            with context.wrap_socket(sock, server_hostname=hostname) as ssock:
                cert = ssock.getpeercert()

        expiry_str = (cert or {}).get('notAfter')
        if not expiry_str:
            # Without this, callers would see valid=False with no reason.
            result['error'] = "SSL certificate has no expiry date"
            return result

        # notAfter is expressed in GMT. cert_time_to_seconds() parses it
        # independent of locale and yields a UTC epoch, so the comparison
        # below is not skewed by the host's local timezone.
        expiry_ts = ssl.cert_time_to_seconds(expiry_str)
        expiry = datetime.fromtimestamp(expiry_ts, tz=timezone.utc)
        result['expiry'] = expiry.replace(tzinfo=None).isoformat()

        seconds_left = expiry_ts - time.time()
        days_until_expiry = int(seconds_left // 86400)
        result['valid'] = days_until_expiry > 7

        # "Expired" is decided on the actual instant: a certificate with a
        # few hours left has 0 whole days remaining but is still in force.
        if seconds_left <= 0:
            result['error'] = "SSL certificate expired"
        elif days_until_expiry <= 7:
            result['error'] = f"SSL expires in {days_until_expiry} days"

        return result

    except ssl.SSLError as e:
        result['error'] = f"SSL error: {str(e)[:100]}"
        return result
    except socket.timeout:
        result['error'] = "Connection timeout"
        return result
    except Exception as e:
        # Best-effort probe: DNS failures, refused connections and
        # malformed certificate dates all end up here.
        result['error'] = str(e)[:100]
        return result
|
|
|
|
|
|
def batch_health_check(websites, max_workers=5):
    """
    Check multiple websites in parallel.

    Empty or None entries in ``websites`` are skipped, so the returned list
    can be shorter than the input. Results are appended as checks finish,
    not in input order; match them to inputs through each result's 'url'.

    Args:
        websites: List of website URLs (None or an empty list yields [])
        max_workers: Number of parallel workers

    Returns:
        List of health check results. A check that raised is represented
        by a dictionary with the same keys as a normal result, with
        'reachable' False and the error text in 'issues'.
    """
    from concurrent.futures import ThreadPoolExecutor, as_completed

    urls = [url for url in (websites or []) if url]
    if not urls:
        # Nothing to check: skip creating the thread pool.
        return []

    results = []

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_url = {
            executor.submit(check_website_health, url): url
            for url in urls
        }

        for future in as_completed(future_to_url):
            url = future_to_url[future]
            try:
                results.append(future.result())
            except Exception as e:
                # Keep the same keys as check_website_health() results so
                # consumers can index every entry uniformly.
                results.append({
                    'url': url,
                    'reachable': False,
                    'ssl_valid': False,
                    'ssl_expiry': None,
                    'load_time': 0,
                    'status_code': 0,
                    'mobile_friendly': False,
                    'has_contact_form': False,
                    'has_phone_number': False,
                    'has_email': False,
                    'title': '',
                    'meta_description': '',
                    'issues': [f"Error: {str(e)[:100]}"],
                })

    return results
|