#!/usr/bin/env python3
"""
Blue Region Extractor - Phase 5
Extracts bullet points from SEC filing blue regions (#1f497d)
"""
import requests
from bs4 import BeautifulSoup
import re
from typing import List, Dict, Optional
# CONFIGURATION
# Module-wide switch read by log_debug(); flip to True for verbose tracing.
DEBUG_ENABLED = False

# Pre-compiled regex patterns (both case-insensitive).
# Matches a style attribute that mentions the Wingdings font.
WINGDINGS_PATTERN = re.compile(r'wingdings', re.IGNORECASE)
# Matches a style attribute containing "display: table-cell".
TABLE_CELL_PATTERN = re.compile(r'display:\s*table-cell', re.IGNORECASE)

# Characters treated as bullet markers.
# NOTE: '■' is listed twice on purpose here -- remove_symbols() performs one
# replacement per list entry, so dropping the duplicate would change how many
# '■' characters get stripped from a text.
BULLET_SYMBOLS = [
    '§', '■', '•', '○', '●', '◦', '·',
    '▪', '▫', '◾', '◽', '◼', '◻', '■',
]
# HELPER FUNCTIONS
# Target blue, written either as a hex colour or as rgb() with any amount of
# whitespace around the components. Matching is case-insensitive.
_BLUE_COLOR_PATTERN = re.compile(
    r'#1f497d|rgb\(\s*31\s*,\s*73\s*,\s*125\s*\)',
    re.IGNORECASE,
)


def is_blue_background(element) -> bool:
    """Check whether an element carries the target blue (#1f497d).

    The colour is searched for in the element's ``style`` and ``bgcolor``
    attributes, as ``#1f497d`` or ``rgb(31, 73, 125)`` in any letter case and
    with any whitespace inside the ``rgb(...)`` form.

    Args:
        element: Any object exposing ``get(name, default)`` for attribute
            lookup (e.g. a BeautifulSoup tag).

    Returns:
        True if either attribute mentions the target colour, else False.

    NOTE(review): the whole ``style`` string is searched, so the colour is
    matched in any CSS property (e.g. ``color:``), not only in a background
    declaration.
    """
    for attr_name in ('style', 'bgcolor'):
        # "or ''" guards against an attribute that is present but None.
        value = str(element.get(attr_name, '') or '')
        if _BLUE_COLOR_PATTERN.search(value):
            return True
    return False
def has_bullet_symbol(text: str, check_anywhere=False) -> bool:
    """Report whether ``text`` contains one of BULLET_SYMBOLS.

    Surrounding whitespace is stripped before testing.

    Args:
        text: Text to inspect.
        check_anywhere: When True, a symbol at any position counts; when
            False (default), the stripped text must begin with a symbol.

    Returns:
        True if a bullet symbol is found under the selected rule.
    """
    candidate = text.strip()
    if not check_anywhere:
        # str.startswith accepts a tuple of alternatives.
        return candidate.startswith(tuple(BULLET_SYMBOLS))
    for symbol in BULLET_SYMBOLS:
        if symbol in candidate:
            return True
    return False
def remove_symbols(text: str) -> str:
    """Strip bullet markers from ``text``.

    For every entry of BULLET_SYMBOLS (in list order) the first remaining
    occurrence of that symbol is deleted, wherever it sits in the text; the
    result is then trimmed of surrounding whitespace.
    """
    cleaned = text
    for symbol in BULLET_SYMBOLS:
        cleaned = cleaned.replace(symbol, '', 1)
    return cleaned.strip()
def log_debug(message: str):
    """Print ``message`` with a ``[DEBUG]`` prefix when DEBUG_ENABLED is set."""
    if not DEBUG_ENABLED:
        return
    print(f"[DEBUG] {message}")
# EXTRACTION HELPERS
def extract_from_paragraph(p, depth: int, pattern_prefix: str) -> Optional[Dict]:
    """Extract a bullet from a single <p> tag, if it looks like one.

    Three heuristics are tried in order; the first whose precondition holds
    decides the outcome (there is no fall-through to a later heuristic):

    1. The <p> contains a <font> whose style mentions Wingdings: the whole
       paragraph text, with bullet symbols removed, is the bullet.
    2. The paragraph text starts with a bullet symbol: same extraction, and
       the leading character is reported as the symbol.
    3. Otherwise, if the parent's text contains a bullet symbol somewhere and
       this paragraph has more than 20 characters with no symbol of its own,
       the paragraph text is taken as a bullet with symbol '•'.

    Args:
        p: The <p> tag to inspect (BeautifulSoup element).
        depth: Nesting depth recorded in the result and used to indent
            debug output.
        pattern_prefix: Prefix for the ``pattern`` label of the result.

    Returns:
        A dict with keys ``text``, ``symbol``, ``depth`` and ``pattern``, or
        None when the paragraph does not qualify.
    """
    # Heuristic 1: Wingdings font inside the paragraph.
    if p.find_all('font', style=WINGDINGS_PATTERN):
        p_text = remove_symbols(p.get_text(separator=' ', strip=True))
        if len(p_text) > 10:
            log_debug(f" {' '*depth} Extracted from <p> (Wingdings): {p_text[:60]}...")
            return {
                'text': p_text,
                'symbol': '§',
                'depth': depth,
                'pattern': f'{pattern_prefix}_nested_p',
            }
        return None

    # Heuristic 2: no Wingdings - look for a leading Unicode bullet symbol.
    # Only the first 50 characters are needed to test the start of the text.
    p_text_preview = p.get_text(strip=True)[:50]
    if has_bullet_symbol(p_text_preview, check_anywhere=False):
        full_p_text = remove_symbols(p.get_text(separator=' ', strip=True))
        if len(full_p_text) > 10:
            log_debug(f" {' '*depth} Extracted from <p> (Unicode): {full_p_text[:60]}...")
            return {
                'text': full_p_text,
                'symbol': p_text_preview[0],
                'depth': depth,
                'pattern': f'{pattern_prefix}_p_no_wingdings',
            }
        return None

    # Heuristic 3: the symbol lives in the parent, the text in this <p>.
    parent = p.parent
    if not parent:
        return None
    parent_text = parent.get_text(separator=' ', strip=True)
    p_only_text = p.get_text(separator=' ', strip=True)
    if not has_bullet_symbol(parent_text, check_anywhere=True) or len(p_only_text) <= 20:
        return None
    # A paragraph with an embedded symbol is treated as a title, not a bullet.
    if has_bullet_symbol(p_only_text, check_anywhere=True):
        return None
    log_debug(f" {' '*depth} ✓ Extracted from <p> (parent has symbol): {p_only_text[:60]}...")
    return {
        'text': p_only_text,
        'symbol': '•',
        'depth': depth,
        'pattern': f'{pattern_prefix}_p_parent_symbol',
    }
def extract_from_paragraphs(all_p_tags: List, depth: int, pattern_prefix: str) -> List[Dict]:
    """Run extract_from_paragraph over each <p> tag and keep the hits.

    Args:
        all_p_tags: The <p> tags to inspect, in order.
        depth: Nesting depth passed through to each extraction.
        pattern_prefix: Pattern-label prefix passed through to each extraction.

    Returns:
        The bullet dicts produced for the tags that qualified, in input order.
    """
    candidates = (
        extract_from_paragraph(tag, depth, pattern_prefix)
        for tag in all_p_tags
    )
    return [found for found in candidates if found]
def extract_from_nested_tables(all_tables: List, depth: int, pattern_prefix: str) -> List[Dict]:
    """Extract bullets from rows of nested <table> elements.

    A row qualifies when it has at least three <td> cells and its second cell
    either starts with a bullet symbol or contains a <font> styled with
    Wingdings. The third cell supplies the bullet text, which must be longer
    than 10 characters.

    Args:
        all_tables: The <table> elements to scan.
        depth: Nesting depth recorded in each result and used to indent
            debug output.
        pattern_prefix: Prefix for the ``pattern`` label of each result.

    Returns:
        A list of bullet dicts (``text``, ``symbol``, ``depth``, ``pattern``).
    """
    found = []
    for table in all_tables:
        for row in table.find_all('tr'):
            cells = row.find_all('td')
            if len(cells) < 3:
                continue

            marker_cell = cells[1]
            marker_text = marker_cell.get_text(strip=True)
            wingdings_font = any(
                'wingdings' in str(font.get('style', '')).lower()
                for font in marker_cell.find_all('font')
            )
            if not (has_bullet_symbol(marker_text, check_anywhere=False) or wingdings_font):
                continue

            body = cells[2].get_text(separator=' ', strip=True)
            if not body or len(body) <= 10:
                continue

            found.append({
                'text': body,
                # Fall back to '§' when the marker cell has no visible text.
                'symbol': marker_text or '§',
                'depth': depth,
                'pattern': f'{pattern_prefix}_nested_table_tr',
            })
            log_debug(f" {' '*depth} Extracted from nested table: {body[:60]}...")
    return found
# MAIN EXTRACTION LOGIC
# Tag names searched when recursing into nested blue regions.
_NESTED_BLUE_TAGS = ['tr', 'td', 'div', 'table']


def _make_bullet(text: str, symbol: str, depth: int, pattern: str) -> Dict:
    """Build one bullet record in the shape returned by the extractors."""
    return {'text': text, 'symbol': symbol, 'depth': depth, 'pattern': pattern}


def _bullets_from_tr(element, depth: int) -> List[Dict]:
    """Pattern 1: bullets laid out across the direct <td> cells of a <tr>."""
    bullets = []
    cells = element.find_all('td', recursive=False)
    log_debug(f" {' '*depth}TR has {len(cells)} cells")

    # 3-cell layout first: | (near-empty) | symbol | text |
    if len(cells) >= 3:
        first_cell_text = cells[0].get_text(strip=True)
        second_cell_text = cells[1].get_text(strip=True)
        if len(first_cell_text) <= 2 and has_bullet_symbol(second_cell_text, check_anywhere=False):
            bullet_text = cells[2].get_text(separator=' ', strip=True)
            if bullet_text and len(bullet_text) > 3:
                bullets.append(_make_bullet(bullet_text, second_cell_text, depth, 'tr_three_cells'))
                log_debug(f" {' '*depth} Extracted (TR 3-cell): {bullet_text[:60]}...")

    # 2-cell layout: | symbol | text | -- only if the 3-cell layout did not match.
    if len(cells) >= 2 and not bullets:
        first_cell_text = cells[0].get_text(strip=True)
        has_symbol_at_start = has_bullet_symbol(first_cell_text, check_anywhere=False)
        has_wingdings = any(
            'wingdings' in str(tag.get('style', '')).lower()
            for tag in cells[0].find_all('font')
        )
        if has_symbol_at_start or has_wingdings:
            bullet_text = cells[1].get_text(separator=' ', strip=True)
            if bullet_text:
                symbol = first_cell_text if has_symbol_at_start else '§'
                bullets.append(_make_bullet(bullet_text, symbol, depth, 'tr_two_cells'))
                log_debug(f" {' '*depth} Extracted (TR pattern): {bullet_text[:60]}...")

    # Title row: a symbol appears somewhere in the row but not at its start,
    # so the bullets are expected inside child <p> tags.
    tr_text = element.get_text(strip=True)
    if has_bullet_symbol(tr_text, check_anywhere=True) and not has_bullet_symbol(tr_text, check_anywhere=False):
        log_debug(f" {' '*depth} Title row detected. Searching children...")
        all_p_tags = element.find_all('p')
        log_debug(f" {' '*depth} Found {len(all_p_tags)} tags")
        bullets.extend(extract_from_paragraphs(all_p_tags, depth, 'tr_title'))
    return bullets


def _bullets_from_text_lines(element, depth: int) -> List[Dict]:
    """Extract bullets line by line from an element that has no <p> children."""
    bullets = []
    pattern = f'{element.name}_title_direct_text'
    lines = element.get_text(separator='\n', strip=True).split('\n')
    i = 0
    while i < len(lines):
        line = lines[i].strip()
        if has_bullet_symbol(line, check_anywhere=False):
            if len(line) > 15:
                # Case 1: symbol and text on the same line.
                clean_line = remove_symbols(line)
                if len(clean_line) > 10:
                    bullets.append(_make_bullet(clean_line, line[0], depth, pattern))
                    log_debug(f" {' '*depth} ✓ Extracted (direct text): {clean_line[:60]}...")
            elif len(line) <= 5 and i + 1 < len(lines):
                # Case 2: symbol alone, text on the next line.
                next_line = lines[i + 1].strip()
                if len(next_line) > 10 and not has_bullet_symbol(next_line, check_anywhere=False):
                    bullets.append(_make_bullet(next_line, line[0], depth, pattern))
                    log_debug(f" {' '*depth} ✓ Extracted (symbol + next line): {next_line[:60]}...")
                    i += 1  # The next line was consumed as the bullet text.
        i += 1
    return bullets


def _bullets_from_container(element, depth: int) -> List[Dict]:
    """Pattern 2: bullets inside a td/div/p/li element."""
    bullets = []
    text = element.get_text(separator=' ', strip=True)

    if has_bullet_symbol(text, check_anywhere=True) and not has_bullet_symbol(text, check_anywhere=False):
        # Title container: symbols occur inside the text but not at its start.
        log_debug(f" {' '*depth} Title {element.name} detected. Searching children...")
        all_p_tags = element.find_all('p')
        log_debug(f" {' '*depth} Found {len(all_p_tags)} <p> tags")
        if all_p_tags:
            bullets.extend(extract_from_paragraphs(all_p_tags, depth, f'{element.name}_title'))
        else:
            # No <p> tags - work from the raw text (handles symbol-only lines).
            log_debug(f" {' '*depth} No <p> tags, trying direct text extraction...")
            bullets.extend(_bullets_from_text_lines(element, depth))

        all_tables = element.find_all('table')
        if all_tables:
            log_debug(f" {' '*depth} Found {len(all_tables)} nested <table> elements")
            bullets.extend(extract_from_nested_tables(all_tables, depth, element.name))

    elif has_bullet_symbol(text, check_anywhere=False):
        if len(text.strip()) <= 3:
            # Symbol-only element: the bullet text is in the next sibling.
            next_sibling = element.find_next_sibling(['td', 'div', 'p'])
            if next_sibling:
                bullet_text = next_sibling.get_text(separator=' ', strip=True)
                if bullet_text and len(bullet_text) > 3:
                    bullets.append(_make_bullet(bullet_text, text.strip(), depth, 'symbol_adjacent_sibling'))
                    log_debug(f" {' '*depth} Extracted (adjacent sibling): {bullet_text[:60]}...")
        else:
            # Symbol and text together in one element.
            # NOTE(review): unlike the other patterns, the leading symbol is
            # left in 'text' here -- confirm whether that is intended.
            bullets.append(_make_bullet(text, text[0], depth, 'single_element'))
            log_debug(f" {' '*depth} Extracted (single element): {text[:60]}...")
    return bullets


def _bullets_from_table_cell_divs(element, depth: int) -> List[Dict]:
    """Pattern 3: a symbol-only ``display: table-cell`` div followed by a text div."""
    bullets = []
    table_cell_divs = element.find_all('div', style=TABLE_CELL_PATTERN)
    i = 0
    while i < len(table_cell_divs):
        text = table_cell_divs[i].get_text(separator=' ', strip=True)
        if has_bullet_symbol(text, check_anywhere=False) and len(text) <= 5:
            if i + 1 < len(table_cell_divs):
                bullet_text = table_cell_divs[i + 1].get_text(separator=' ', strip=True)
                if bullet_text and len(bullet_text) > 10:
                    bullets.append(_make_bullet(bullet_text, text.strip(), depth, 'div_table_cell_within_blue'))
                    log_debug(f" {' '*depth} Extracted (div table-cell): {bullet_text[:60]}...")
                    i += 2  # Skip the div that was just consumed as text.
                    continue
        i += 1
    return bullets


def _covered_by_nested_blue(node, root) -> bool:
    """Return True if a blue recursion-target tag sits between ``node`` and ``root``.

    Such an intermediate tag is itself visited by the recursion, and its own
    recursive call reaches ``node``, so visiting ``node`` directly from
    ``root`` would only repeat the same extraction.
    """
    ancestor = node.parent
    while ancestor is not None and ancestor is not root:
        if ancestor.name in _NESTED_BLUE_TAGS and is_blue_background(ancestor):
            return True
        ancestor = ancestor.parent
    return False


def extract_text_from_element(element, depth=0) -> List[Dict]:
    """Extract bullet points from a blue element and its nested blue regions.

    Non-blue elements yield an empty list. For a blue element the following
    are applied:

    * Pattern 1 (``tr``): symbol/text spread over the row's direct cells,
      plus "title row" handling that searches child <p> tags.
    * Pattern 2 (``td``/``div``/``p``/``li``): title containers (child <p>
      tags, raw text lines, nested tables) or a leading symbol in the element
      itself or in front of its next sibling.
    * Pattern 3 (only if nothing was found so far): pairs of
      ``display: table-cell`` divs.
    * Recursion into blue ``tr``/``td``/``div``/``table`` descendants at
      ``depth + 1``. Each such descendant is processed once, through its
      nearest blue ancestor among those tags, instead of once per blue
      ancestor.

    Args:
        element: BeautifulSoup tag to inspect.
        depth: Nesting depth recorded in each bullet and used to indent
            debug output.

    Returns:
        A list of bullet dicts with keys ``text``, ``symbol``, ``depth`` and
        ``pattern``. The list is not de-duplicated.
    """
    bullets = []
    if not is_blue_background(element):
        return bullets

    element_text = element.get_text(separator=' ', strip=True)[:100]
    log_debug(f" {' '*depth}Found blue element: {element.name} | Text: '{element_text}...'")

    if element.name == 'tr':
        bullets.extend(_bullets_from_tr(element, depth))
    elif element.name in ['td', 'div', 'p', 'li']:
        bullets.extend(_bullets_from_container(element, depth))

    if not bullets:
        bullets.extend(_bullets_from_table_cell_divs(element, depth))

    # find_all() returns descendants only, so the element itself never shows
    # up here and needs no explicit exclusion.
    for child in element.find_all(_NESTED_BLUE_TAGS, recursive=True):
        if not is_blue_background(child):
            continue
        if _covered_by_nested_blue(child, element):
            continue
        bullets.extend(extract_text_from_element(child, depth + 1))
    return bullets
# URL SCRAPING
def scrape_url(url: str, debug: bool = False) -> Dict:
    """Download a page and extract the bullets found in its blue regions.

    Sets the module-level DEBUG_ENABLED flag to ``debug`` before doing
    anything else.

    Args:
        url: Address of the page to fetch.
        debug: Value assigned to DEBUG_ENABLED for this and later calls.

    Returns:
        A dict with keys ``url``, ``status`` ('success' or 'failed'),
        ``bullets`` (bullet dicts, first occurrence of each distinct text)
        and ``error`` (None on success, otherwise a message). Request and
        parsing errors are reported through ``error`` rather than raised.
    """
    global DEBUG_ENABLED
    DEBUG_ENABLED = debug

    outcome = {'url': url, 'status': 'failed', 'bullets': [], 'error': None}
    try:
        response = requests.get(
            url,
            headers={'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36'},
            timeout=30,
        )
        response.raise_for_status()

        soup = BeautifulSoup(response.content, 'html.parser')
        blue_elements = [tag for tag in soup.find_all() if is_blue_background(tag)]
        if not blue_elements:
            outcome['error'] = 'No blue region found'
            return outcome

        # Keep the first bullet seen for each distinct text; dicts preserve
        # insertion order, so the original ordering is retained.
        first_by_text = {}
        for tag in blue_elements:
            for bullet in extract_text_from_element(tag):
                first_by_text.setdefault(bullet['text'], bullet)
        unique_bullets = list(first_by_text.values())

        outcome['bullets'] = unique_bullets
        if unique_bullets:
            outcome['status'] = 'success'
        else:
            outcome['error'] = 'No bullets found in blue region'
    except requests.RequestException as exc:
        outcome['error'] = f"Request failed: {str(exc)}"
    except Exception as exc:
        outcome['error'] = f"Parsing failed: {str(exc)}"
    return outcome
def filter_bullets_by_keywords(bullets: List[Dict], keywords: List[str], exclude_keywords: Optional[List[str]] = None) -> List[str]:
    """Return the cleaned texts of bullets that match the keyword filters.

    Matching is a case-insensitive substring test on each bullet's ``text``.
    A bullet is skipped if it contains any exclude keyword (exclusion takes
    priority); otherwise it is kept if it contains any include keyword.
    Keywords that are not strings, or that are empty or whitespace-only, are
    ignored -- an empty string is a substring of every text and would
    otherwise match (or exclude) every bullet.

    Args:
        bullets: Bullet dicts, each with a ``text`` entry.
        keywords: Include keywords; at least one usable keyword is required.
        exclude_keywords: Optional exclude keywords.

    Returns:
        Texts of the matching bullets, in input order, with every run of
        whitespace collapsed to a single space and the ends trimmed. Empty
        if ``bullets`` is empty or no usable include keyword is given.
    """
    if not bullets or not keywords:
        return []

    keywords_lower = [
        kw.lower() for kw in keywords
        if isinstance(kw, str) and kw.strip()
    ]
    if not keywords_lower:
        return []
    exclude_keywords_lower = [
        kw.lower() for kw in (exclude_keywords or [])
        if isinstance(kw, str) and kw.strip()
    ]

    matching_bullets = []
    for bullet in bullets:
        text_lower = bullet['text'].lower()
        # Exclusion takes priority over inclusion.
        if any(kw in text_lower for kw in exclude_keywords_lower):
            continue
        if any(kw in text_lower for kw in keywords_lower):
            # \s+ already covers newlines, carriage returns and tabs.
            matching_bullets.append(re.sub(r'\s+', ' ', bullet['text']).strip())
    return matching_bullets