GMB-Scraper/lib/review_scraper.py

228 lines
6.9 KiB
Python
Raw Permalink Normal View History

"""
Review Scraper Module
=====================
Extract reviews from Google Maps business pages.
"""
import hashlib
import re
from datetime import datetime, timedelta

from .logger import get_logger
from .retry import retry_with_backoff
def parse_relative_date(date_string):
    """
    Parse relative date strings like "2 days ago", "1 week ago", "a month ago".

    The phrase may appear anywhere in the string (e.g. "Edited 3 days ago").
    Months and years are approximated as 30 and 365 days respectively.

    Args:
        date_string: Relative date string

    Returns:
        datetime object (local, naive) or None if the string is empty, does
        not contain a recognised "<n> <unit> ago" phrase, or the resulting
        date would be out of the representable range
    """
    if not date_string:
        return None

    # Keys are the singular unit names, because that is exactly what the
    # regex group captures (the optional plural "s" sits outside the group).
    unit_seconds = {
        'second': 1,
        'minute': 60,
        'hour': 3600,
        'day': 86400,
        'week': 604800,
        'month': 2592000,   # 30 days
        'year': 31536000,   # 365 days
    }

    # The amount is either a number or the article "a"/"an" (meaning one);
    # the word boundary keeps the article from matching the tail of a word.
    match = re.search(
        r'(\d+|\ban?)\s+(second|minute|hour|day|week|month|year)s?\s+ago',
        date_string.lower().strip(),
    )
    if not match:
        return None

    amount, unit = match.groups()
    count = 1 if amount in ('a', 'an') else int(amount)

    try:
        return datetime.now() - timedelta(seconds=count * unit_seconds[unit])
    except OverflowError:
        # Absurdly large counts cannot be represented as a datetime.
        return None
@retry_with_backoff(max_attempts=2, base_delay=1.0)
def scrape_reviews(page, max_reviews=50, days_back=90):
    """
    Scrape reviews from an open Google Maps business page.

    Collects the review elements currently present, then scrolls the reviews
    feed to load more, for at most 20 rounds. Stops early when the limit is
    reached, when a review older than the cutoff is seen, or when a round
    yields no new reviews.

    Args:
        page: Playwright page with business open
        max_reviews: Maximum number of reviews to scrape
        days_back: Only scrape reviews from last N days (0 = all)

    Returns:
        List of review dictionaries. On an unexpected error the failure is
        logged and the reviews gathered so far are returned.
    """
    logger = get_logger()
    reviews = []
    seen_ids = set()  # O(1) duplicate detection across scroll rounds
    cutoff_date = datetime.now() - timedelta(days=days_back) if days_back > 0 else None

    try:
        # Click "Reviews" tab if not already there (best-effort)
        try:
            reviews_tab = page.locator('button[aria-label*="Reviews"]').first
            if reviews_tab.count() > 0:
                reviews_tab.click()
                page.wait_for_timeout(1500)
        except Exception as e:
            logger.debug(f"Could not open Reviews tab: {e}")

        # Scroll to load more reviews
        for scroll_iteration in range(20):
            # Extract visible reviews
            review_elements = page.locator('[data-review-id]').all()
            if not review_elements:
                # Try alternative selector
                review_elements = page.locator('.OD1W0[role="article"], [jsaction*="reviewChart"]').all()

            new_count = 0
            for element in review_elements:
                try:
                    review = extract_review_data(element)
                    # Skip failed extractions and reviews already collected
                    if not review or review['id'] in seen_ids:
                        continue

                    # Check date cutoff before keeping the review, so a review
                    # outside the requested window is never returned.
                    # NOTE(review): stopping at the first old review assumes
                    # the page lists reviews newest-first -- confirm.
                    parsed = review.get('date_parsed')
                    if cutoff_date and parsed and parsed < cutoff_date:
                        logger.debug(f"Reached cutoff date at review {len(reviews)}")
                        return reviews[:max_reviews]

                    seen_ids.add(review['id'])
                    reviews.append(review)
                    new_count += 1

                    if len(reviews) >= max_reviews:
                        return reviews[:max_reviews]
                except Exception as e:
                    logger.debug(f"Error extracting review: {e}")
                    continue

            if new_count == 0:
                logger.debug(f"No new reviews after {scroll_iteration + 1} scrolls")
                break

            # Scroll down
            try:
                page.evaluate("""
                    const scrollable = document.querySelector('[role="feed"]') ||
                    document.querySelector('.m6QErb.DxyBCb.kA9KIf.dS8AEf');
                    if (scrollable) scrollable.scrollBy(0, 1000);
                """)
                page.wait_for_timeout(1000)
            except Exception:
                # Cannot scroll any further; keep what was collected
                break

        logger.info(f"Scraped {len(reviews)} reviews")
        return reviews[:max_reviews]
    except Exception as e:
        logger.warning(f"Failed to scrape reviews: {e}")
        return reviews[:max_reviews]
def _first_inner_text(element, selector):
    """Return the stripped text of the first match for selector, or ''."""
    try:
        target = element.locator(selector).first
        if target.count() > 0:
            return target.inner_text().strip()
    except Exception:
        # Best-effort lookup: a missing or detached node yields no text
        pass
    return ''


def extract_review_data(element):
    """
    Extract review data from a review element.

    Every field is looked up independently and best-effort: when a lookup
    fails or nothing matches, that field keeps its default value.

    Args:
        element: Playwright element

    Returns:
        Dictionary with the keys 'id', 'rating', 'text', 'date',
        'date_parsed' and 'reviewer', or None if extraction fails outright
    """
    try:
        # Get review ID
        review_id = element.get_attribute('data-review-id') or ''
        if not review_id:
            # Generate pseudo-ID from text. A digest is used instead of the
            # built-in hash(), whose value for strings changes between
            # interpreter runs, so the ID stays the same across runs.
            snippet = element.inner_text()[:50]
            review_id = hashlib.sha256(snippet.encode('utf-8')).hexdigest()[:16]

        # Get rating
        rating = 0
        try:
            # Substring "star" also matches a singular label ("1 star"),
            # which a selector on "stars" alone would miss.
            rating_el = element.locator('[aria-label*="star"], [aria-label*="Star"]').first
            if rating_el.count() > 0:
                aria = rating_el.get_attribute('aria-label') or ''
                match = re.search(r'(\d+)', aria)
                if match:
                    rating = int(match.group(1))
        except Exception:
            pass

        # Get review text
        text = _first_inner_text(element, '[class*="wiI7pd"], [jsaction*="reviewChart"] span')

        # Get date (raw string plus parsed datetime when recognisable)
        date_string = _first_inner_text(element, '[class*="rsqaWe"], [class*="review-date"]')
        date_parsed = parse_relative_date(date_string) if date_string else None

        # Get reviewer name
        reviewer = _first_inner_text(element, '[class*="d4r55"], [class*="reviewer-name"]')

        return {
            'id': review_id,
            'rating': rating,
            'text': text,
            'date': date_string,
            'date_parsed': date_parsed,
            'reviewer': reviewer,
        }
    except Exception:
        return None
def filter_painful_reviews(reviews, min_rating=2):
    """
    Keep only the low-rated ("painful") reviews.

    Args:
        reviews: List of review dictionaries
        min_rating: Highest rating still counted as painful (despite its
            name this is an upper bound); a review without a 'rating' key
            is treated as rated 5

    Returns:
        New list holding the matching reviews in their original order
    """
    painful = []
    for review in reviews:
        stars = review.get('rating', 5)
        if stars <= min_rating:
            painful.append(review)
    return painful
def get_recent_reviews(reviews, days=30):
    """
    Keep only the reviews dated within the last N days.

    Reviews whose 'date_parsed' value is missing or empty are dropped.

    Args:
        reviews: List of review dictionaries
        days: Number of days to look back

    Returns:
        New list holding the matching reviews in their original order
    """
    oldest_allowed = datetime.now() - timedelta(days=days)
    recent = []
    for review in reviews:
        parsed = review.get('date_parsed')
        if not parsed:
            continue
        if parsed >= oldest_allowed:
            recent.append(review)
    return recent