""" Review Scraper Module ===================== Extract reviews from Google Maps business pages. """ import re from datetime import datetime, timedelta from .logger import get_logger from .retry import retry_with_backoff def parse_relative_date(date_string): """ Parse relative date strings like "2 days ago", "1 week ago". Args: date_string: Relative date string Returns: datetime object or None """ if not date_string: return None now = datetime.now() date_string = date_string.lower().strip() # Patterns patterns = [ (r'(\d+)\s+(second|minute|hour|day|week|month|year)s?\s+ago', lambda m: { 'seconds': 1, 'minutes': 60, 'hours': 3600, 'days': 86400, 'weeks': 604800, 'months': 2592000, 'years': 31536000 }.get(m.group(2), 0) * int(m.group(1))), ] for pattern, calc in patterns: match = re.search(pattern, date_string) if match: seconds = calc(match) return now - timedelta(seconds=seconds) return None @retry_with_backoff(max_attempts=2, base_delay=1.0) def scrape_reviews(page, max_reviews=50, days_back=90): """ Scrape reviews from an open Google Maps business page. Args: page: Playwright page with business open max_reviews: Maximum number of reviews to scrape days_back: Only scrape reviews from last N days (0 = all) Returns: List of review dictionaries """ logger = get_logger() reviews = [] cutoff_date = datetime.now() - timedelta(days=days_back) if days_back > 0 else None try: # Click "Reviews" tab if not already there try: reviews_tab = page.locator('button[aria-label*="Reviews"]').first if reviews_tab.count() > 0: reviews_tab.click() page.wait_for_timeout(1500) except Exception: pass # Scroll to load more reviews for scroll_iteration in range(20): # Extract visible reviews review_elements = page.locator('[data-review-id]').all() if not review_elements: # Try alternative selector review_elements = page.locator('.OD1W0[role="article"], [jsaction*="reviewChart"]').all() new_count = 0 for element in review_elements: try: review = extract_review_data(element) if review and review['id'] not in [r['id'] for r in reviews]: reviews.append(review) new_count += 1 # Check date cutoff if cutoff_date and review.get('date_parsed'): if review['date_parsed'] < cutoff_date: logger.debug(f"Reached cutoff date at review {len(reviews)}") return reviews[:max_reviews] if len(reviews) >= max_reviews: return reviews except Exception as e: logger.debug(f"Error extracting review: {e}") continue if new_count == 0: logger.debug(f"No new reviews after {scroll_iteration + 1} scrolls") break # Scroll down try: page.evaluate(""" const scrollable = document.querySelector('[role="feed"]') || document.querySelector('.m6QErb.DxyBCb.kA9KIf.dS8AEf'); if (scrollable) scrollable.scrollBy(0, 1000); """) page.wait_for_timeout(1000) except Exception: break logger.info(f"Scraped {len(reviews)} reviews") return reviews[:max_reviews] except Exception as e: logger.warning(f"Failed to scrape reviews: {e}") return reviews def extract_review_data(element): """ Extract review data from a review element. Args: element: Playwright element Returns: Dictionary with review data """ try: # Get review ID review_id = element.get_attribute('data-review-id') or '' if not review_id: # Generate pseudo-ID from text text = element.inner_text()[:50] review_id = str(hash(text)) # Get rating rating = 0 try: rating_el = element.locator('[aria-label*="stars"], [aria-label*="Stars"]').first if rating_el.count() > 0: aria = rating_el.get_attribute('aria-label') or '' match = re.search(r'(\d+)', aria) if match: rating = int(match.group(1)) except Exception: pass # Get review text text = '' try: text_el = element.locator('[class*="wiI7pd"], [jsaction*="reviewChart"] span').first if text_el.count() > 0: text = text_el.inner_text().strip() except Exception: pass # Get date date_string = '' date_parsed = None try: date_el = element.locator('[class*="rsqaWe"], [class*="review-date"]').first if date_el.count() > 0: date_string = date_el.inner_text().strip() date_parsed = parse_relative_date(date_string) except Exception: pass # Get reviewer name reviewer = '' try: name_el = element.locator('[class*="d4r55"], [class*="reviewer-name"]').first if name_el.count() > 0: reviewer = name_el.inner_text().strip() except Exception: pass return { 'id': review_id, 'rating': rating, 'text': text, 'date': date_string, 'date_parsed': date_parsed, 'reviewer': reviewer, } except Exception as e: return None def filter_painful_reviews(reviews, min_rating=2): """ Filter reviews to only painful ones (low ratings). Args: reviews: List of review dictionaries min_rating: Maximum rating to include Returns: Filtered list """ return [r for r in reviews if r.get('rating', 5) <= min_rating] def get_recent_reviews(reviews, days=30): """ Filter to only recent reviews. Args: reviews: List of review dictionaries days: Number of days to look back Returns: Filtered list """ cutoff = datetime.now() - timedelta(days=days) return [ r for r in reviews if r.get('date_parsed') and r['date_parsed'] >= cutoff ]