228 lines
6.9 KiB
Python
228 lines
6.9 KiB
Python
|
|
"""
Review Scraper Module
=====================
Extract reviews from Google Maps business pages.
"""
|
||
|
|
|
||
|
|
import re
|
||
|
|
from datetime import datetime, timedelta
|
||
|
|
from .logger import get_logger
|
||
|
|
from .retry import retry_with_backoff
|
||
|
|
|
||
|
|
|
||
|
|
def parse_relative_date(date_string):
    """
    Parse relative date strings like "2 days ago", "a week ago".

    The match is case-insensitive and may appear anywhere in the string
    (e.g. "Edited 3 months ago"). Months are approximated as 30 days and
    years as 365 days.

    Args:
        date_string: Relative date string

    Returns:
        datetime object (naive, local time) or None when the string is
        empty, has no recognizable "<n> <unit> ago" phrase, or the offset
        falls outside the range datetime can represent.
    """
    if not date_string:
        return None

    # Keys are the SINGULAR unit names because that is what the regex group
    # captures (the optional plural "s" sits outside the group).
    unit_seconds = {
        'second': 1,
        'minute': 60,
        'hour': 3600,
        'day': 86400,
        'week': 604800,
        'month': 2592000,
        'year': 31536000,
    }

    match = re.search(
        r'\b(\d+|an?)\s+(second|minute|hour|day|week|month|year)s?\s+ago',
        date_string.lower().strip(),
    )
    if not match:
        return None

    amount_text, unit = match.groups()
    # "a week ago" / "an hour ago" mean exactly one unit.
    amount = 1 if amount_text in ('a', 'an') else int(amount_text)

    try:
        return datetime.now() - timedelta(seconds=amount * unit_seconds[unit])
    except OverflowError:
        # Absurdly large offsets cannot be represented as a datetime.
        return None
|
||
|
|
|
||
|
|
|
||
|
|
@retry_with_backoff(max_attempts=2, base_delay=1.0)
def scrape_reviews(page, max_reviews=50, days_back=90):
    """
    Scrape reviews from an open Google Maps business page.

    Opens the "Reviews" tab when one is found, then alternates between
    extracting the currently rendered review elements and scrolling the
    feed (at most 20 scroll rounds). Collection stops when max_reviews is
    reached, when a scroll round yields nothing new, or when a review older
    than the cutoff is met. That older review is NOT included in the result.

    NOTE(review): stopping at the first too-old review assumes the page
    lists reviews newest-first -- confirm the sort order used by callers.
    NOTE(review): every exception is caught and logged here, so
    retry_with_backoff presumably never observes a failure -- confirm.

    Args:
        page: Playwright page with business open
        max_reviews: Maximum number of reviews to scrape
        days_back: Only scrape reviews from last N days (0 = all)

    Returns:
        List of review dictionaries (possibly partial if scraping failed)
    """
    logger = get_logger()
    reviews = []
    seen_ids = set()  # O(1) duplicate detection across scroll rounds
    cutoff_date = datetime.now() - timedelta(days=days_back) if days_back > 0 else None

    try:
        # Click "Reviews" tab if not already there
        try:
            reviews_tab = page.locator('button[aria-label*="Reviews"]').first
            if reviews_tab.count() > 0:
                reviews_tab.click()
                page.wait_for_timeout(1500)
        except Exception:
            # Best-effort: the reviews may already be visible without the tab.
            pass

        # Scroll to load more reviews
        for scroll_iteration in range(20):
            # Extract visible reviews
            review_elements = page.locator('[data-review-id]').all()

            if not review_elements:
                # Try alternative selector
                review_elements = page.locator('.OD1W0[role="article"], [jsaction*="reviewChart"]').all()

            new_count = 0
            for element in review_elements:
                try:
                    review = extract_review_data(element)
                    if not review or review['id'] in seen_ids:
                        continue

                    # Check date cutoff BEFORE storing, so a review outside
                    # the requested window never leaks into the result.
                    date_parsed = review.get('date_parsed')
                    if cutoff_date and date_parsed and date_parsed < cutoff_date:
                        logger.debug(f"Reached cutoff date at review {len(reviews)}")
                        return reviews[:max_reviews]

                    seen_ids.add(review['id'])
                    reviews.append(review)
                    new_count += 1

                    if len(reviews) >= max_reviews:
                        return reviews
                except Exception as e:
                    logger.debug(f"Error extracting review: {e}")
                    continue

            if new_count == 0:
                logger.debug(f"No new reviews after {scroll_iteration + 1} scrolls")
                break

            # Scroll down
            try:
                page.evaluate("""
                    const scrollable = document.querySelector('[role="feed"]') ||
                                      document.querySelector('.m6QErb.DxyBCb.kA9KIf.dS8AEf');
                    if (scrollable) scrollable.scrollBy(0, 1000);
                """)
                page.wait_for_timeout(1000)
            except Exception:
                # Cannot scroll any further; keep what has been collected.
                break

        logger.info(f"Scraped {len(reviews)} reviews")
        return reviews[:max_reviews]

    except Exception as e:
        logger.warning(f"Failed to scrape reviews: {e}")
        return reviews
|
||
|
|
|
||
|
|
|
||
|
|
def _first_inner_text(element, selector):
    """Return the stripped inner text of the first node matching selector, or ''."""
    try:
        target = element.locator(selector).first
        if target.count() > 0:
            return target.inner_text().strip()
    except Exception:
        # Best-effort: any lookup failure simply yields an empty field.
        pass
    return ''


def extract_review_data(element):
    """
    Extract review data from a review element.

    Each field is looked up independently and falls back to an empty/zero
    value when it cannot be read, so one missing field does not discard
    the whole review.

    Args:
        element: Playwright element

    Returns:
        Dictionary with keys 'id', 'rating' (int, 0 when not found), 'text',
        'date' (raw string), 'date_parsed' (datetime or None) and
        'reviewer'; or None if the element could not be read at all.
    """
    import hashlib

    try:
        # Get review ID
        review_id = element.get_attribute('data-review-id') or ''
        if not review_id:
            # Generate pseudo-ID from text. A content digest is used rather
            # than built-in hash(), which is salted per process for str and
            # would give the same review a different id on every run.
            snippet = element.inner_text()[:50]
            review_id = hashlib.sha256(snippet.encode('utf-8', 'replace')).hexdigest()[:16]

        # Get rating
        rating = 0
        try:
            rating_el = element.locator('[aria-label*="stars"], [aria-label*="Stars"]').first
            if rating_el.count() > 0:
                aria = rating_el.get_attribute('aria-label') or ''
                match = re.search(r'(\d+)', aria)
                if match:
                    rating = int(match.group(1))
        except Exception:
            # Best-effort: leave rating at 0 ("unknown").
            pass

        # Get review text
        text = _first_inner_text(element, '[class*="wiI7pd"], [jsaction*="reviewChart"] span')

        # Get date
        date_string = _first_inner_text(element, '[class*="rsqaWe"], [class*="review-date"]')
        try:
            date_parsed = parse_relative_date(date_string)
        except Exception:
            date_parsed = None

        # Get reviewer name
        reviewer = _first_inner_text(element, '[class*="d4r55"], [class*="reviewer-name"]')

        return {
            'id': review_id,
            'rating': rating,
            'text': text,
            'date': date_string,
            'date_parsed': date_parsed,
            'reviewer': reviewer,
        }
    except Exception as e:
        get_logger().debug(f"Could not read review element: {e}")
        return None
|
||
|
|
|
||
|
|
|
||
|
|
def filter_painful_reviews(reviews, min_rating=2):
    """
    Filter reviews to only painful ones (low ratings).

    Reviews whose rating is unknown (0 or None, which is what
    extract_review_data stores when no star rating could be read) are
    skipped rather than being counted as painful.

    Args:
        reviews: List of review dictionaries
        min_rating: Maximum rating to include (inclusive), despite the name

    Returns:
        Filtered list
    """
    painful = []
    for review in reviews:
        # A review without a 'rating' key is treated as 5 stars.
        rating = review.get('rating', 5)
        if not rating:
            # 0 / None means "rating not found", not "zero stars".
            continue
        if rating <= min_rating:
            painful.append(review)
    return painful
|
||
|
|
|
||
|
|
|
||
|
|
def get_recent_reviews(reviews, days=30):
    """
    Keep only the reviews dated within the last `days` days.

    Reviews with a missing or empty 'date_parsed' value are dropped.

    Args:
        reviews: List of review dictionaries
        days: Number of days to look back

    Returns:
        Filtered list
    """
    oldest_allowed = datetime.now() - timedelta(days=days)

    recent = []
    for entry in reviews:
        parsed = entry.get('date_parsed')
        if not parsed:
            # Undated reviews cannot be placed inside the window.
            continue
        if parsed >= oldest_allowed:
            recent.append(entry)
    return recent
|