#!/usr/bin/env python3
"""
Blue Region Extractor - Phase 5
Extracts bullet points from SEC filing blue regions (#1f497d)
"""

import requests
from bs4 import BeautifulSoup
import re
from typing import List, Dict, Optional

# CONFIGURATION
# Module-wide verbosity switch read by log_debug(); set to True for verbose
# debugging output.
DEBUG_ENABLED = False

# Pre-compiled, case-insensitive regex patterns
WINGDINGS_PATTERN = re.compile(r'wingdings', re.IGNORECASE)
TABLE_CELL_PATTERN = re.compile(r'display:\s*table-cell', re.IGNORECASE)

# Characters treated as bullet markers. Order matters only for iteration order
# in the helpers that scan this list.
BULLET_SYMBOLS = [
    '§',
    '■',
    '•',
    '○',
    '●',
    '◦',
    '·',
    '▪',
    '▫',
    '◾',
    '◽',
    '◼',
    '◻',
    '￭',
]

# HELPER FUNCTIONS
def is_blue_background(element) -> bool:
    """Check whether an element carries the target blue colour (#1f497d).

    The colour is searched for in the element's ``style`` and ``bgcolor``
    attributes, case-insensitively, in either hex (``#1f497d``) or functional
    (``rgb(31, 73, 125)``) notation. Whitespace is ignored, so any spacing
    inside ``rgb(...)`` is accepted.

    Note that the whole ``style`` string is searched, so the colour matches
    wherever it appears in the inline style, not only in a background
    property.

    Args:
        element: Any object exposing a dict-like ``get(name, default)`` for
            attributes (e.g. a BeautifulSoup tag).

    Returns:
        True if either attribute mentions the target colour, else False.
    """
    blue_markers = ('rgb(31,73,125)', '#1f497d')

    for attr_name in ('style', 'bgcolor'):
        raw_value = element.get(attr_name, '') or ''  # tolerate a None value
        # Lower-case and drop all whitespace so "RGB( 31 , 73 , 125 )" and
        # "rgb(31,73,125)" compare equal.
        normalized = ''.join(str(raw_value).lower().split())
        if any(marker in normalized for marker in blue_markers):
            return True
    return False

def has_bullet_symbol(text: str, check_anywhere=False) -> bool:
    """Report whether *text* contains one of the BULLET_SYMBOLS.

    Surrounding whitespace is stripped first. With ``check_anywhere=True`` a
    symbol at any position counts; otherwise the stripped text must begin
    with a symbol.
    """
    candidate = text.strip()

    if not check_anywhere:
        # str.startswith accepts a tuple of alternatives.
        return candidate.startswith(tuple(BULLET_SYMBOLS))

    for symbol in BULLET_SYMBOLS:
        if symbol in candidate:
            return True
    return False

def remove_symbols(text: str) -> str:
    """Strip bullet markers from *text*.

    For every entry in BULLET_SYMBOLS, the first occurrence (at any position
    in the string) is removed; the result is then trimmed of leading and
    trailing whitespace.
    """
    cleaned = text
    for symbol in BULLET_SYMBOLS:
        # count=1: only the first occurrence of each symbol is dropped
        cleaned = cleaned.replace(symbol, '', 1)
    return cleaned.strip()

def log_debug(message: str):
    """Print *message* prefixed with "[DEBUG] " when DEBUG_ENABLED is truthy."""
    if not DEBUG_ENABLED:
        return
    print(f"[DEBUG] {message}")

# EXTRACTION HELPERS
def extract_from_paragraph(p, depth: int, pattern_prefix: str) -> Optional[Dict]:
    """Try to turn a single <p> tag into a bullet record.

    Three strategies are tried, and the first one that applies decides the
    outcome:

    1. The paragraph contains a <font> whose style mentions Wingdings:
       the symbol-stripped text is used if it is longer than 10 characters.
    2. The paragraph text starts with a Unicode bullet symbol: the
       symbol-stripped text is used if it is longer than 10 characters.
    3. Otherwise, the parent's text contains a bullet symbol somewhere, the
       paragraph has more than 20 characters and no symbol of its own.

    Args:
        p: The paragraph tag to inspect.
        depth: Nesting depth recorded in the result (also used to indent
            debug output).
        pattern_prefix: Prefix for the 'pattern' label in the result.

    Returns:
        A dict with keys 'text', 'symbol', 'depth' and 'pattern', or None
        when the paragraph does not qualify as a bullet.
    """
    # Strategy 1: Wingdings-styled <font> inside the paragraph
    if p.find_all('font', style=WINGDINGS_PATTERN):
        wingdings_text = remove_symbols(p.get_text(separator=' ', strip=True))
        if len(wingdings_text) <= 10:
            return None
        log_debug(f"  {'  '*depth}   Extracted from <p> (Wingdings): {wingdings_text[:60]}...")
        return {
            'text': wingdings_text,
            'symbol': '§',
            'depth': depth,
            'pattern': f'{pattern_prefix}_nested_p'
        }

    # Strategy 2: paragraph text itself begins with a Unicode bullet
    preview = p.get_text(strip=True)[:50]
    if has_bullet_symbol(preview, check_anywhere=False):
        body_text = remove_symbols(p.get_text(separator=' ', strip=True))
        if len(body_text) <= 10:
            return None
        log_debug(f"  {'  '*depth}   Extracted from <p> (Unicode): {body_text[:60]}...")
        return {
            'text': body_text,
            'symbol': preview[0],
            'depth': depth,
            'pattern': f'{pattern_prefix}_p_no_wingdings'
        }

    # Strategy 3: the symbol lives in the parent, the paragraph holds the text
    container = p.parent
    if not container:
        return None

    container_text = container.get_text(separator=' ', strip=True)
    own_text = p.get_text(separator=' ', strip=True)

    parent_has_symbol = has_bullet_symbol(container_text, check_anywhere=True)
    if not parent_has_symbol or len(own_text) <= 20:
        return None

    # A paragraph with an embedded symbol is treated as a title, not a bullet
    if has_bullet_symbol(own_text, check_anywhere=True):
        return None

    log_debug(f"  {'  '*depth}  ✓ Extracted from <p> (parent has symbol): {own_text[:60]}...")
    return {
        'text': own_text,
        'symbol': '•',
        'depth': depth,
        'pattern': f'{pattern_prefix}_p_parent_symbol'
    }

def extract_from_paragraphs(all_p_tags: List, depth: int, pattern_prefix: str) -> List[Dict]:
    """Run extract_from_paragraph over each <p> tag and keep the hits.

    Tags are processed in the given order; paragraphs that yield no bullet
    are skipped.
    """
    candidates = (
        extract_from_paragraph(tag, depth, pattern_prefix)
        for tag in all_p_tags
    )
    return [found for found in candidates if found]

def extract_from_nested_tables(all_tables: List, depth: int, pattern_prefix: str) -> List[Dict]:
    """Collect bullets from rows of nested <table> elements.

    A row qualifies when it has at least three <td> cells and its second
    cell either starts with a bullet symbol or contains a <font> whose style
    mentions Wingdings. The third cell supplies the bullet text, which must
    be longer than 10 characters.

    Returns:
        A list of dicts with keys 'text', 'symbol', 'depth' and 'pattern'.
    """
    found = []

    for table in all_tables:
        for row in table.find_all('tr'):
            tds = row.find_all('td')
            if len(tds) < 3:
                continue

            marker_cell, content_cell = tds[1], tds[2]
            marker_text = marker_cell.get_text(strip=True)

            uses_wingdings = False
            for font_tag in marker_cell.find_all('font'):
                if 'wingdings' in str(font_tag.get('style', '')).lower():
                    uses_wingdings = True
                    break

            starts_with_symbol = has_bullet_symbol(marker_text, check_anywhere=False)
            if not (starts_with_symbol or uses_wingdings):
                continue

            content = content_cell.get_text(separator=' ', strip=True)
            if len(content) <= 10:
                continue

            found.append({
                'text': content,
                # Wingdings cells may carry no visible text; fall back to '§'
                'symbol': marker_text or '§',
                'depth': depth,
                'pattern': f'{pattern_prefix}_nested_table_tr'
            })
            log_debug(f"  {'  '*depth}   Extracted from nested table: {content[:60]}...")

    return found

# MAIN EXTRACTION LOGIC

# Tag names that the recursion step of extract_text_from_element descends into.
_NESTED_BLUE_TAGS = ['tr', 'td', 'div', 'table']


def _inside_nested_blue_container(node, root) -> bool:
    """Return True if a blue container tag lies strictly between node and root.

    Such a node is reached through that intermediate container's own
    recursive call, so processing it directly from ``root`` would only
    produce duplicates.
    """
    ancestor = node.parent
    while ancestor is not None and ancestor is not root:
        if ancestor.name in _NESTED_BLUE_TAGS and is_blue_background(ancestor):
            return True
        ancestor = ancestor.parent
    return False


def extract_text_from_element(element, depth=0) -> List[Dict]:
    """Extract bullet points from a blue element and its nested blue children.

    Elements without the blue background yield an empty list. Otherwise the
    following patterns are tried:

    * Pattern 1 (``<tr>``): three-cell rows (empty, symbol, text), then
      two-cell rows (symbol or Wingdings, text), plus "title rows" whose
      text contains a symbol but does not start with one.
    * Pattern 2 (``<td>``, ``<div>``, ``<p>``, ``<li>``): title containers
      (searched via their ``<p>`` tags, or line by line when there are none,
      and always via nested tables), symbol-only elements followed by a
      sibling, and elements holding symbol and text together.
    * Pattern 3 (any tag, only if nothing was found above): pairs of
      ``display: table-cell`` divs (symbol div followed by text div).

    Finally the function recurses into nested blue ``tr``/``td``/``div``/
    ``table`` descendants with ``depth + 1``. Only the nearest such
    descendants are visited directly; deeper ones are reached through the
    recursive calls, so each nested element is processed once per call tree.

    Args:
        element: The tag to inspect.
        depth: Nesting level recorded in each bullet and used to indent
            debug output.

    Returns:
        A list of dicts with keys 'text', 'symbol', 'depth' and 'pattern'.
    """
    bullets = []

    if not is_blue_background(element):
        return bullets

    # The text preview exists only for the debug line, so skip the work
    # when debugging is off.
    if DEBUG_ENABLED:
        element_text = element.get_text(separator=' ', strip=True)[:100]
        log_debug(f"  {'  '*depth}Found blue element: {element.name} | Text: '{element_text}...'")

    # Pattern 1: TR with Two or Three Cells
    if element.name == 'tr':
        cells = element.find_all('td', recursive=False)
        log_debug(f"  {'  '*depth}TR has {len(cells)} cells")

        # Check for 3-cell pattern first: <td></td><td>§</td><td>Text</td>
        if len(cells) >= 3:
            first_cell_text = cells[0].get_text(strip=True)
            second_cell_text = cells[1].get_text(strip=True)

            # If first cell is (nearly) empty and second has symbol
            if len(first_cell_text) <= 2 and has_bullet_symbol(second_cell_text, check_anywhere=False):
                bullet_text = cells[2].get_text(separator=' ', strip=True)
                if bullet_text and len(bullet_text) > 3:
                    bullets.append({
                        'text': bullet_text,
                        'symbol': second_cell_text,
                        'depth': depth,
                        'pattern': 'tr_three_cells'
                    })
                    log_debug(f"  {'  '*depth} Extracted (TR 3-cell): {bullet_text[:60]}...")

        # Check for 2-cell pattern: <td>§</td><td>Text</td>
        if len(cells) >= 2 and not bullets:  # Only if 3-cell didn't match
            first_cell_text = cells[0].get_text(strip=True)
            has_symbol_at_start = has_bullet_symbol(first_cell_text, check_anywhere=False)
            has_wingdings = any('wingdings' in str(tag.get('style', '')).lower()
                              for tag in cells[0].find_all('font'))

            if has_symbol_at_start or has_wingdings:
                bullet_text = cells[1].get_text(separator=' ', strip=True)
                if bullet_text:
                    bullets.append({
                        'text': bullet_text,
                        # Wingdings cells may hold no recognised symbol text
                        'symbol': first_cell_text if has_symbol_at_start else '§',
                        'depth': depth,
                        'pattern': 'tr_two_cells'
                    })
                    log_debug(f"  {'  '*depth} Extracted (TR pattern): {bullet_text[:60]}...")

        # Title row detection: a symbol appears somewhere, but not at the start
        tr_text = element.get_text(strip=True)
        if has_bullet_symbol(tr_text, check_anywhere=True) and not has_bullet_symbol(tr_text, check_anywhere=False):
            log_debug(f"  {'  '*depth} Title row detected. Searching children...")

            all_p_tags = element.find_all('p')
            log_debug(f"  {'  '*depth}  Found {len(all_p_tags)} <p> tags")
            bullets.extend(extract_from_paragraphs(all_p_tags, depth, 'tr_title'))

    # Pattern 2: TD/DIV/P with Title Container
    elif element.name in ['td', 'div', 'p', 'li']:
        text = element.get_text(separator=' ', strip=True)

        # Title container detection: a symbol appears somewhere, but not at the start
        if has_bullet_symbol(text, check_anywhere=True) and not has_bullet_symbol(text, check_anywhere=False):
            log_debug(f"  {'  '*depth} Title {element.name} detected. Searching children...")

            all_p_tags = element.find_all('p')
            log_debug(f"  {'  '*depth}  Found {len(all_p_tags)} <p> tags")

            # If <p> tags exist, use them
            if all_p_tags:
                bullets.extend(extract_from_paragraphs(all_p_tags, depth, f'{element.name}_title'))
            else:
                # No <p> tags - extract from raw text (handles symbol-only lines)
                log_debug(f"  {'  '*depth}  No <p> tags, trying direct text extraction...")

                full_text = element.get_text(separator='\n', strip=True)
                lines = full_text.split('\n')

                # Manual index so Case 2 can consume the following line too
                i = 0
                while i < len(lines):
                    line = lines[i].strip()

                    if has_bullet_symbol(line, check_anywhere=False):
                        # Case 1: Symbol + text on same line
                        if len(line) > 15:
                            clean_line = remove_symbols(line)
                            if len(clean_line) > 10:
                                bullets.append({
                                    'text': clean_line,
                                    'symbol': line[0],
                                    'depth': depth,
                                    'pattern': f'{element.name}_title_direct_text'
                                })
                                log_debug(f"  {'  '*depth}  ✓ Extracted (direct text): {clean_line[:60]}...")

                        # Case 2: Symbol alone, text on next line
                        elif len(line) <= 5 and i + 1 < len(lines):
                            next_line = lines[i + 1].strip()

                            if len(next_line) > 10 and not has_bullet_symbol(next_line, check_anywhere=False):
                                bullets.append({
                                    'text': next_line,
                                    'symbol': line[0],
                                    'depth': depth,
                                    'pattern': f'{element.name}_title_direct_text'
                                })
                                log_debug(f"  {'  '*depth}  ✓ Extracted (symbol + next line): {next_line[:60]}...")
                                i += 1  # Skip next line

                    i += 1

            # Nested tables are scanned whether or not <p> tags were present
            all_tables = element.find_all('table')
            if all_tables:
                log_debug(f"  {'  '*depth}  Found {len(all_tables)} nested <table> elements")
                bullets.extend(extract_from_nested_tables(all_tables, depth, element.name))

        # Symbol at start
        elif has_bullet_symbol(text, check_anywhere=False):
            if len(text.strip()) <= 3:
                # Symbol-only cell, check next sibling
                next_sibling = element.find_next_sibling(['td', 'div', 'p'])
                if next_sibling:
                    bullet_text = next_sibling.get_text(separator=' ', strip=True)
                    if bullet_text and len(bullet_text) > 3:
                        bullets.append({
                            'text': bullet_text,
                            'symbol': text.strip(),
                            'depth': depth,
                            'pattern': 'symbol_adjacent_sibling'
                        })
                        log_debug(f"  {'  '*depth} Extracted (adjacent sibling): {bullet_text[:60]}...")
            else:
                # Symbol and text together
                # NOTE(review): unlike the other patterns, 'text' here still
                # includes the leading symbol - confirm whether intended.
                bullets.append({
                    'text': text,
                    'symbol': text[0],
                    'depth': depth,
                    'pattern': 'single_element'
                })
                log_debug(f"  {'  '*depth} Extracted (single element): {text[:60]}...")

    # Pattern 3: Div Table-Cell Pattern (fallback when nothing matched above)
    if not bullets:
        table_cell_divs = element.find_all('div', style=TABLE_CELL_PATTERN)
        i = 0
        while i < len(table_cell_divs):
            div = table_cell_divs[i]
            text = div.get_text(separator=' ', strip=True)

            if has_bullet_symbol(text, check_anywhere=False) and len(text) <= 5:
                if i + 1 < len(table_cell_divs):
                    next_div = table_cell_divs[i + 1]
                    bullet_text = next_div.get_text(separator=' ', strip=True)
                    if bullet_text and len(bullet_text) > 10:
                        bullets.append({
                            'text': bullet_text,
                            'symbol': text.strip(),
                            'depth': depth,
                            'pattern': 'div_table_cell_within_blue'
                        })
                        log_debug(f"  {'  '*depth} Extracted (div table-cell): {bullet_text[:60]}...")
                        i += 2  # Symbol div and text div both consumed
                        continue
            i += 1

    # Recursion for nested blue elements. find_all() returns every
    # descendant, so skip those already covered by a nearer blue container's
    # recursive call; otherwise deeply nested elements are re-processed once
    # per ancestor and emitted repeatedly with conflicting depths.
    for child in element.find_all(_NESTED_BLUE_TAGS, recursive=True):
        if not is_blue_background(child):
            continue
        if _inside_nested_blue_container(child, element):
            continue
        bullets.extend(extract_text_from_element(child, depth + 1))

    return bullets

# URL SCRAPING
def scrape_url(url: str, debug: bool = False) -> Dict:
    """Download *url* and extract de-duplicated bullets from its blue regions.

    Sets the module-level DEBUG_ENABLED flag to *debug* as a side effect.

    Args:
        url: Address of the page to fetch.
        debug: Whether to enable debug printing for this (and later) calls.

    Returns:
        A dict with keys 'url', 'status' ('success' or 'failed'), 'bullets'
        (list of bullet dicts, unique by 'text', first occurrence kept) and
        'error' (None or a message). Request and parsing errors are reported
        through 'error' rather than raised.
    """
    global DEBUG_ENABLED
    DEBUG_ENABLED = debug

    outcome = {
        'url': url,
        'status': 'failed',
        'bullets': [],
        'error': None
    }

    try:
        response = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'},
            timeout=30,
        )
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')

        # Every tag in the document that carries the blue colour
        blue_elements = [tag for tag in soup.find_all() if is_blue_background(tag)]
        if not blue_elements:
            outcome['error'] = 'No blue region found'
            return outcome

        collected = []
        for tag in blue_elements:
            collected.extend(extract_text_from_element(tag))

        # De-duplicate on text, keeping the first occurrence in order
        texts_seen = set()
        deduped = []
        for bullet in collected:
            bullet_text = bullet['text']
            if bullet_text in texts_seen:
                continue
            texts_seen.add(bullet_text)
            deduped.append(bullet)

        outcome['bullets'] = deduped

        if deduped:
            outcome['status'] = 'success'
        else:
            outcome['error'] = 'No bullets found in blue region'

    except requests.RequestException as e:
        outcome['error'] = f"Request failed: {str(e)}"
    except Exception as e:
        outcome['error'] = f"Parsing failed: {str(e)}"

    return outcome

def filter_bullets_by_keywords(bullets: List[Dict], keywords: List[str], exclude_keywords: Optional[List[str]] = None) -> List[str]:
    """Return the texts of bullets that match the include keywords.

    Matching is a case-insensitive substring test against each bullet's
    'text'. A bullet is dropped if it contains any exclude keyword (exclusion
    takes priority); otherwise it is kept if it contains any include keyword.
    Kept texts have every run of whitespace collapsed to a single space and
    are trimmed.

    Blank keywords (empty, whitespace-only or None) are ignored, because an
    empty string is a substring of every text and would otherwise match, or
    exclude, every bullet.

    Args:
        bullets: Bullet dicts, each with a 'text' entry.
        keywords: Include keywords; at least one non-blank entry is needed
            for anything to match.
        exclude_keywords: Optional keywords that veto a bullet.

    Returns:
        Cleaned texts of the matching bullets, in input order. Empty when
        there are no bullets or no usable include keywords.
    """
    if not bullets or not keywords:
        return []

    include_terms = [kw.lower() for kw in keywords if kw and kw.strip()]
    if not include_terms:
        return []
    exclude_terms = [kw.lower() for kw in (exclude_keywords or []) if kw and kw.strip()]

    matching_bullets = []
    for bullet in bullets:
        text_lower = bullet['text'].lower()

        # Exclude takes priority - skip if any exclude keyword is found
        if any(term in text_lower for term in exclude_terms):
            continue

        # Include if any include keyword is found
        if any(term in text_lower for term in include_terms):
            # \s+ already covers \n, \r and \t
            matching_bullets.append(re.sub(r'\s+', ' ', bullet['text']).strip())

    return matching_bullets