#!/usr/bin/env python3
"""
Blue Region Extractor - Phase 5
Extracts bullet points from SEC filing blue regions (#1f497d)
"""

import re
from typing import Dict, List, Optional

import requests
from bs4 import BeautifulSoup

# CONFIGURATION
DEBUG_ENABLED = False  # Set to True for verbose debugging

# Pre-compiled regex patterns
WINGDINGS_PATTERN = re.compile(r'wingdings', re.I)
TABLE_CELL_PATTERN = re.compile(r'display:\s*table-cell', re.I)
_WHITESPACE_PATTERN = re.compile(r'\s+')

# Characters treated as bullet markers ('■' is listed twice; harmless).
BULLET_SYMBOLS = ['§', '■', '•', '○', '●', '◦', '·', '▪', '▫', '◾', '◽', '◼', '◻', '■']

# Target colour, lower-cased and with all whitespace removed.
_BLUE_MARKERS = ('rgb(31,73,125)', '#1f497d')

# Tag names that extract_text_from_element() descends into when recursing.
_NESTED_BLUE_TAGS = ['tr', 'td', 'div', 'table']


# HELPER FUNCTIONS
def is_blue_background(element) -> bool:
    """Check if element has the target blue background.

    The element's ``style`` and ``bgcolor`` attributes are searched for the
    colour written either as ``#1f497d`` or as ``rgb(31,73,125)``.  Matching
    is case-insensitive and ignores whitespace, so ``rgb(31, 73, 125)`` and
    ``RGB( 31,73,125 )`` are both recognised.

    Note that any occurrence of the colour in ``style`` matches; the
    property name is not inspected.
    """
    for attr_name in ('style', 'bgcolor'):
        raw_value = element.get(attr_name, '') or ''
        normalised = _WHITESPACE_PATTERN.sub('', str(raw_value)).lower()
        if any(marker in normalised for marker in _BLUE_MARKERS):
            return True
    return False


def has_bullet_symbol(text: str, check_anywhere=False) -> bool:
    """Check if text contains bullet symbols.

    With ``check_anywhere=False`` (default) the stripped text must *start*
    with one of BULLET_SYMBOLS; with ``check_anywhere=True`` a symbol at any
    position counts.
    """
    text = text.strip()
    if check_anywhere:
        return any(sym in text for sym in BULLET_SYMBOLS)
    return text.startswith(tuple(BULLET_SYMBOLS))


def remove_symbols(text: str) -> str:
    """Return text with its leading bullet marker(s) removed.

    Only the run of bullet symbols (and the whitespace between them) at the
    start of the text is stripped.  Symbols that occur later in the text are
    kept, so a middle dot or bullet character used inside the sentence is
    not lost.
    """
    prefixes = tuple(sym for sym in BULLET_SYMBOLS if sym)
    cleaned = text.strip()
    while cleaned.startswith(prefixes):
        for sym in prefixes:
            if cleaned.startswith(sym):
                cleaned = cleaned[len(sym):].lstrip()
                break
    return cleaned


def log_debug(message: str):
    """Print message with a ``[DEBUG]`` prefix when DEBUG_ENABLED is set."""
    if DEBUG_ENABLED:
        print(f"[DEBUG] {message}")


def _make_bullet(text: str, symbol: str, depth: int, pattern: str) -> Dict:
    """Build one bullet record in the shape every extractor returns."""
    return {'text': text, 'symbol': symbol, 'depth': depth, 'pattern': pattern}


def _has_wingdings_font(cell) -> bool:
    """Return True if any <font> inside cell mentions Wingdings in its style."""
    return any(
        'wingdings' in str(tag.get('style', '')).lower()
        for tag in cell.find_all('font')
    )


# EXTRACTION HELPERS
def extract_from_paragraph(p, depth: int, pattern_prefix: str) -> Optional[Dict]:
    """Extract bullet from a single <p> tag.

    Three cases are tried, in order:

    1. The paragraph contains a Wingdings <font>: its text (leading symbols
       removed) is the bullet if longer than 10 characters.
    2. The paragraph text starts with a bullet symbol: same treatment.
    3. Otherwise, if the parent's text contains a symbol somewhere and the
       paragraph itself is symbol-free and longer than 20 characters, the
       paragraph text is taken as a bullet.

    Returns the bullet dict, or None when nothing qualifies.
    """
    # Check for Wingdings font
    wingdings_in_p = p.find_all('font', style=WINGDINGS_PATTERN)
    if wingdings_in_p:
        p_text = remove_symbols(p.get_text(separator=' ', strip=True))
        if len(p_text) > 10:
            log_debug(f" {' '*depth} Extracted from <p> (Wingdings): {p_text[:60]}...")
            return _make_bullet(p_text, '§', depth, f'{pattern_prefix}_nested_p')
        return None

    # No Wingdings - check for Unicode symbols
    p_text_preview = p.get_text(strip=True)[:50]
    if has_bullet_symbol(p_text_preview, check_anywhere=False):
        full_p_text = remove_symbols(p.get_text(separator=' ', strip=True))
        if len(full_p_text) > 10:
            log_debug(f" {' '*depth} Extracted from <p> (Unicode): {full_p_text[:60]}...")
            return _make_bullet(
                full_p_text, p_text_preview[0], depth,
                f'{pattern_prefix}_p_no_wingdings',
            )
        return None

    # Check if parent has symbol and this p has substantive text
    parent = p.parent
    if parent is None:
        return None
    parent_text = parent.get_text(separator=' ', strip=True)
    p_only_text = p.get_text(separator=' ', strip=True)
    # If parent has symbol somewhere and p has substantive text
    if has_bullet_symbol(parent_text, check_anywhere=True) and len(p_only_text) > 20:
        # Check if p text is NOT just a title (doesn't have embedded symbols)
        if not has_bullet_symbol(p_only_text, check_anywhere=True):
            log_debug(f" {' '*depth} ✓ Extracted from <p> (parent has symbol): {p_only_text[:60]}...")
            return _make_bullet(
                p_only_text, '•', depth, f'{pattern_prefix}_p_parent_symbol',
            )
    return None


def extract_from_paragraphs(all_p_tags: List, depth: int, pattern_prefix: str) -> List[Dict]:
    """Extract bullets from list of <p> tags, skipping those that yield none."""
    bullets = []
    for p in all_p_tags:
        bullet = extract_from_paragraph(p, depth, pattern_prefix)
        if bullet:
            bullets.append(bullet)
    return bullets


def extract_from_nested_tables(all_tables: List, depth: int, pattern_prefix: str) -> List[Dict]:
    """Extract bullets from nested <table> elements.

    A row qualifies when it has at least three <td> cells and the second
    cell either starts with a bullet symbol or contains a Wingdings <font>;
    the third cell's text (if longer than 10 characters) is the bullet.
    """
    bullets = []
    for table in all_tables:
        for tr in table.find_all('tr'):
            cells = tr.find_all('td')
            if len(cells) < 3:
                continue
            second_cell = cells[1].get_text(strip=True)
            has_wingdings = _has_wingdings_font(cells[1])
            if not (has_bullet_symbol(second_cell, check_anywhere=False) or has_wingdings):
                continue
            bullet_text = cells[2].get_text(separator=' ', strip=True)
            if bullet_text and len(bullet_text) > 10:
                bullets.append(_make_bullet(
                    bullet_text,
                    second_cell if second_cell else '§',
                    depth,
                    f'{pattern_prefix}_nested_table_tr',
                ))
                log_debug(f" {' '*depth} Extracted from nested table: {bullet_text[:60]}...")
    return bullets


def _extract_from_tr(element, depth: int) -> List[Dict]:
    """Pattern 1: bullets from a blue <tr> (3-cell, 2-cell and title rows)."""
    bullets = []
    cells = element.find_all('td', recursive=False)
    log_debug(f" {' '*depth}TR has {len(cells)} cells")

    # Check for 3-cell pattern first: <td></td><td>§</td><td>Text</td>
    if len(cells) >= 3:
        first_cell_text = cells[0].get_text(strip=True)
        second_cell_text = cells[1].get_text(strip=True)
        # If first cell is empty and second has symbol
        if len(first_cell_text) <= 2 and has_bullet_symbol(second_cell_text, check_anywhere=False):
            bullet_text = cells[2].get_text(separator=' ', strip=True)
            if bullet_text and len(bullet_text) > 3:
                bullets.append(_make_bullet(bullet_text, second_cell_text, depth, 'tr_three_cells'))
                log_debug(f" {' '*depth} Extracted (TR 3-cell): {bullet_text[:60]}...")

    # Check for 2-cell pattern: <td>§</td><td>Text</td>
    if len(cells) >= 2 and not bullets:  # Only if 3-cell didn't match
        first_cell_text = cells[0].get_text(strip=True)
        has_symbol_at_start = has_bullet_symbol(first_cell_text, check_anywhere=False)
        has_wingdings = _has_wingdings_font(cells[0])
        if has_symbol_at_start or has_wingdings:
            bullet_text = cells[1].get_text(separator=' ', strip=True)
            if bullet_text:
                bullets.append(_make_bullet(
                    bullet_text,
                    first_cell_text if has_symbol_at_start else '§',
                    depth,
                    'tr_two_cells',
                ))
                log_debug(f" {' '*depth} Extracted (TR pattern): {bullet_text[:60]}...")

    # Title row detection: a symbol appears in the row, but not at its start
    tr_text = element.get_text(strip=True)
    if has_bullet_symbol(tr_text, check_anywhere=True) and not has_bullet_symbol(tr_text, check_anywhere=False):
        log_debug(f" {' '*depth} Title row detected. Searching children...")
        all_p_tags = element.find_all('p')
        log_debug(f" {' '*depth} Found {len(all_p_tags)} <p> tags")
        bullets.extend(extract_from_paragraphs(all_p_tags, depth, 'tr_title'))
    return bullets


def _extract_direct_text(element, depth: int) -> List[Dict]:
    """Bullets from an element's raw text lines (used when it has no <p>).

    Handles a symbol followed by text on the same line, and a symbol alone
    on a line with its text on the following line.
    """
    bullets = []
    pattern = f'{element.name}_title_direct_text'
    lines = element.get_text(separator='\n', strip=True).split('\n')
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        if has_bullet_symbol(line, check_anywhere=False):
            # Case 1: Symbol + text on same line
            if len(line) > 15:
                clean_line = remove_symbols(line)
                if len(clean_line) > 10:
                    bullets.append(_make_bullet(clean_line, line[0], depth, pattern))
                    log_debug(f" {' '*depth} ✓ Extracted (direct text): {clean_line[:60]}...")
            # Case 2: Symbol alone, text on next line
            elif len(line) <= 5 and i + 1 < len(lines):
                next_line = lines[i + 1].strip()
                if len(next_line) > 10 and not has_bullet_symbol(next_line, check_anywhere=False):
                    bullets.append(_make_bullet(next_line, line[0], depth, pattern))
                    log_debug(f" {' '*depth} ✓ Extracted (symbol + next line): {next_line[:60]}...")
                    i += 1  # Skip next line, it was consumed as the bullet text
        i += 1
    return bullets


def _extract_from_container(element, depth: int) -> List[Dict]:
    """Pattern 2: bullets from a blue <td>/<div>/<p>/<li>."""
    bullets = []
    text = element.get_text(separator=' ', strip=True)

    # Title container detection: a symbol appears, but not at the start
    if has_bullet_symbol(text, check_anywhere=True) and not has_bullet_symbol(text, check_anywhere=False):
        log_debug(f" {' '*depth} Title {element.name} detected. Searching children...")
        all_p_tags = element.find_all('p')
        log_debug(f" {' '*depth} Found {len(all_p_tags)} <p> tags")

        # If <p> tags exist, use them
        if all_p_tags:
            bullets.extend(extract_from_paragraphs(all_p_tags, depth, f'{element.name}_title'))
        else:
            # No <p> tags - extract from raw text (handles symbol-only lines)
            log_debug(f" {' '*depth} No <p> tags, trying direct text extraction...")
            bullets.extend(_extract_direct_text(element, depth))

        all_tables = element.find_all('table')
        if all_tables:
            log_debug(f" {' '*depth} Found {len(all_tables)} nested <table> elements")
            bullets.extend(extract_from_nested_tables(all_tables, depth, element.name))

    # Symbol at start
    elif has_bullet_symbol(text, check_anywhere=False):
        if len(text.strip()) <= 3:
            # Symbol-only cell, check next sibling
            next_sibling = element.find_next_sibling(['td', 'div', 'p'])
            if next_sibling:
                bullet_text = next_sibling.get_text(separator=' ', strip=True)
                if bullet_text and len(bullet_text) > 3:
                    bullets.append(_make_bullet(bullet_text, text.strip(), depth, 'symbol_adjacent_sibling'))
                    log_debug(f" {' '*depth} Extracted (adjacent sibling): {bullet_text[:60]}...")
        else:
            # Symbol and text together
            # NOTE(review): 'text' keeps its leading symbol here, unlike the
            # <p>-based paths which strip it - confirm this is intended.
            bullets.append(_make_bullet(text, text[0], depth, 'single_element'))
            log_debug(f" {' '*depth} Extracted (single element): {text[:60]}...")
    return bullets


def _extract_div_table_cells(element, depth: int) -> List[Dict]:
    """Pattern 3: symbol/text pairs in consecutive ``display: table-cell`` divs."""
    bullets = []
    table_cell_divs = element.find_all('div', style=TABLE_CELL_PATTERN)
    i = 0
    while i < len(table_cell_divs):
        text = table_cell_divs[i].get_text(separator=' ', strip=True)
        is_symbol_cell = has_bullet_symbol(text, check_anywhere=False) and len(text) <= 5
        if is_symbol_cell and i + 1 < len(table_cell_divs):
            bullet_text = table_cell_divs[i + 1].get_text(separator=' ', strip=True)
            if bullet_text and len(bullet_text) > 10:
                bullets.append(_make_bullet(bullet_text, text.strip(), depth, 'div_table_cell_within_blue'))
                log_debug(f" {' '*depth} Extracted (div table-cell): {bullet_text[:60]}...")
                i += 2  # The following div was consumed as the bullet text
                continue
        i += 1
    return bullets


def _nearest_blue_descendants(element) -> List:
    """Return blue descendants whose closest recursable blue ancestor is element.

    Deeper blue descendants are reached through the recursion of the blue
    element that encloses them, so returning them here as well would extract
    the same region once per enclosing level.
    """
    nearest = []
    for child in element.find_all(_NESTED_BLUE_TAGS, recursive=True):
        if not is_blue_background(child):
            continue
        for ancestor in child.parents:
            if ancestor is element:
                nearest.append(child)
                break
            if ancestor.name in _NESTED_BLUE_TAGS and is_blue_background(ancestor):
                break  # handled when that ancestor is processed
    return nearest


# MAIN EXTRACTION LOGIC
def extract_text_from_element(element, depth=0) -> List[Dict]:
    """Extract bullet points from element and its children.

    Returns an empty list unless element itself has the blue background.
    Otherwise bullets are collected from:

    * Pattern 1 - a <tr> with symbol/text cells or a title row;
    * Pattern 2 - a <td>/<div>/<p>/<li> title container or symbol element;
    * Pattern 3 - ``display: table-cell`` divs (only if nothing was found);
    * nested blue <tr>/<td>/<div>/<table> descendants, at ``depth + 1``.

    Each bullet is a dict with keys ``text``, ``symbol``, ``depth`` and
    ``pattern``.  The result may contain repeated texts; callers that need
    unique bullets must de-duplicate (scrape_url does).
    """
    bullets = []
    if not is_blue_background(element):
        return bullets

    element_text = element.get_text(separator=' ', strip=True)[:100]
    log_debug(f" {' '*depth}Found blue element: {element.name} | Text: '{element_text}...'")

    # Pattern 1: TR with Two or Three Cells
    if element.name == 'tr':
        bullets.extend(_extract_from_tr(element, depth))
    # Pattern 2: TD/DIV/P with Title Container
    elif element.name in ['td', 'div', 'p', 'li']:
        bullets.extend(_extract_from_container(element, depth))

    # Pattern 3: Div Table-Cell Pattern
    if not bullets:
        bullets.extend(_extract_div_table_cells(element, depth))

    # Recursion for nested blue elements.  Only the nearest blue descendants
    # are visited; each of them recurses into its own nested blue regions.
    for child in _nearest_blue_descendants(element):
        bullets.extend(extract_text_from_element(child, depth + 1))
    return bullets


# URL SCRAPING
def scrape_url(url: str, debug: bool = False) -> Dict:
    """Fetch url and extract the unique bullets found in its blue regions.

    Sets the module-level DEBUG_ENABLED flag to ``debug`` (it stays set
    after the call).  Never raises: failures are reported in the result.

    Returns a dict with keys ``url``, ``status`` (``'success'`` or
    ``'failed'``), ``bullets`` (bullet dicts, first occurrence of each text
    kept) and ``error`` (None on success, otherwise a message).
    """
    global DEBUG_ENABLED
    DEBUG_ENABLED = debug

    result = {
        'url': url,
        'status': 'failed',
        'bullets': [],
        'error': None
    }

    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'
        }
        response = requests.get(url, headers=headers, timeout=30)
        response.raise_for_status()

        # Parse HTML
        soup = BeautifulSoup(response.content, 'html.parser')

        # Find all elements with blue background
        blue_elements = [el for el in soup.find_all() if is_blue_background(el)]
        if not blue_elements:
            result['error'] = 'No blue region found'
            return result

        # Extract bullets from all blue elements
        all_bullets = []
        for element in blue_elements:
            all_bullets.extend(extract_text_from_element(element))

        # Keep the first occurrence of each bullet text
        seen_texts = set()
        unique_bullets = []
        for bullet in all_bullets:
            if bullet['text'] not in seen_texts:
                seen_texts.add(bullet['text'])
                unique_bullets.append(bullet)

        result['bullets'] = unique_bullets
        if not unique_bullets:
            result['error'] = 'No bullets found in blue region'
        else:
            result['status'] = 'success'

    except requests.RequestException as e:
        result['error'] = f"Request failed: {str(e)}"
    except Exception as e:
        # Top-level boundary: report any parsing failure instead of raising.
        result['error'] = f"Parsing failed: {str(e)}"

    return result


def filter_bullets_by_keywords(bullets: List[Dict], keywords: List[str],
                               exclude_keywords: Optional[List[str]] = None) -> List[str]:
    """Return the whitespace-normalised texts of bullets matching keywords.

    Matching is a case-insensitive substring test on each bullet's ``text``.
    A bullet containing any exclude keyword is dropped even if it also
    contains an include keyword.  Returns an empty list when ``bullets`` or
    ``keywords`` is empty.
    """
    if not bullets or not keywords:
        return []

    matching_bullets = []
    keywords_lower = [kw.lower() for kw in keywords]
    exclude_keywords_lower = [kw.lower() for kw in (exclude_keywords or [])]

    for bullet in bullets:
        text_lower = bullet['text'].lower()

        # ML Exclude takes priority - skip if any exclude keyword is found
        if any(kw in text_lower for kw in exclude_keywords_lower):
            continue

        # Include if any ML Include keyword is found
        if any(kw in text_lower for kw in keywords_lower):
            # \s+ already covers newlines, carriage returns and tabs
            clean_text = _WHITESPACE_PATTERN.sub(' ', bullet['text']).strip()
            matching_bullets.append(clean_text)

    return matching_bullets