The idea is to combine structural awareness with semantic intelligence in two phases: first chunk along the document's explicit structure, then refine the resulting boundaries semantically.
Document structure refers to the hierarchical organization of content:
FUNCTION detect_document_structure(document):
    // 1. Detect Headers
    headers = find_headers(document)  // H1, H2, H3; Markdown #, ##, ###

    // 2. Detect Paragraphs
    paragraphs = split_by_double_newlines(document)

    // 3. Detect Lists
    lists = find_lists(document)  // Bullet points, numbered lists

    // 4. Detect Special Elements
    tables = find_tables(document)
    code_blocks = find_code_blocks(document)
    quotes = find_quotes(document)

    // 5. Build Hierarchy
    hierarchy = build_hierarchy(headers, paragraphs, lists, tables, code_blocks, quotes)
    RETURN hierarchy
END FUNCTION
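As a concrete sketch, here is what this detection pass might look like in Python for Markdown-style input. Every regex below is an illustrative assumption rather than a complete grammar; real documents (PDF, HTML, DOCX) would need format-specific parsers.

import re

def detect_document_structure(document: str) -> dict:
    # Minimal sketch for Markdown-style text; each pattern is an
    # illustrative assumption, not a full grammar for the element.
    return {
        "headers":     re.findall(r"^#{1,6}\s+.+$", document, flags=re.MULTILINE),
        "paragraphs":  [p for p in document.split("\n\n") if p.strip()],
        "lists":       re.findall(r"^\s*(?:[-*+]|\d+\.)\s+.+$", document, flags=re.MULTILINE),
        "tables":      re.findall(r"^\|.+\|\s*$", document, flags=re.MULTILINE),
        "code_blocks": re.findall(r"```[\s\S]*?```", document),
        "quotes":      re.findall(r"^>\s?.+$", document, flags=re.MULTILINE),
    }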
FUNCTION find_headers(text):
    header_patterns = [
        r'^#{1,6}\s+(.+)$',        // Markdown headers
        r'^([A-Z][A-Z\s]+)$',      // ALL CAPS headers
        r'^\d+\.\s+([A-Z].+)$',    // Numbered headers like "1. Introduction"
        r'^([A-Z][a-z]+.*):$',     // Title with colon
        r'^(.+)\n=+$',             // Underlined with =
        r'^(.+)\n-+$'              // Underlined with -
    ]
    // Note: the two underlined patterns span two lines, so they need a
    // two-line window rather than a single-line match.

    headers = []
    FOR each line in text:
        FOR each pattern in header_patterns:
            IF line matches pattern:
                headers.append({
                    'text': extract_title(line, pattern),
                    'level': determine_header_level(line, pattern),
                    'position': line_number,
                    'type': 'header'
                })
                BREAK  // first matching pattern wins
    RETURN headers
END FUNCTION
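Translated into Python, the pattern table might look as follows. The level each pattern maps to is an assumption (Markdown depth from the number of #'s, fixed levels for the other styles), and the explicit two-line check handles the setext-style underlines that a per-line scan would otherwise miss.

import re

# Each entry pairs a compiled pattern with a rule for deriving the header
# level; the level assignments here are assumptions, not a standard.
HEADER_PATTERNS = [
    (re.compile(r"^(#{1,6})\s+(.+)$"), lambda m: len(m.group(1))),  # Markdown
    (re.compile(r"^([A-Z][A-Z\s]+)$"), lambda m: 1),                # ALL CAPS
    (re.compile(r"^\d+\.\s+([A-Z].+)$"), lambda m: 2),              # "1. Introduction"
    (re.compile(r"^([A-Z][a-z]+.*):$"), lambda m: 3),               # Title with colon
]

def find_headers(text: str) -> list[dict]:
    headers = []
    lines = text.splitlines()
    for i, line in enumerate(lines):
        # Setext-style headers span two lines: a title plus an underline
        # made only of '=' (level 1) or '-' (level 2).
        if line.strip() and i + 1 < len(lines) and re.fullmatch(r"=+|-+", lines[i + 1]):
            level = 1 if lines[i + 1].startswith("=") else 2
            headers.append({"text": line.strip(), "level": level,
                            "position": i, "type": "header"})
            continue
        for pattern, level_fn in HEADER_PATTERNS:
            m = pattern.match(line)
            if m:
                headers.append({"text": m.group(m.lastindex).strip(),
                                "level": level_fn(m),
                                "position": i, "type": "header"})
                break  # first matching pattern wins
    return headers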
After structural chunking, apply semantic analysis to optimize chunk boundaries, merging or splitting chunks as needed.
FUNCTION semantic_refinement(initial_chunks, similarity_threshold):
    refined_chunks = []

    FOR chunk in initial_chunks:
        IF chunk.status == "TOO_SMALL":
            // Try to merge with semantically similar neighbors
            merged_chunk = attempt_semantic_merge(chunk, similarity_threshold)
            refined_chunks.append(merged_chunk)
        ELSE IF chunk.status == "TOO_LARGE":
            // Split using semantic boundaries
            split_chunks = semantic_split(chunk, similarity_threshold)
            refined_chunks.extend(split_chunks)
        ELSE:
            // Good size - check semantic coherence
            coherence_score = calculate_semantic_coherence(chunk)
            IF coherence_score < similarity_threshold:
                improved_chunk = improve_chunk_coherence(chunk)
                refined_chunks.append(improved_chunk)
            ELSE:
                refined_chunks.append(chunk)

    RETURN refined_chunks
END FUNCTION
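A minimal runnable version of this loop follows, assuming plain-string chunks, character-length size checks, and a calculate_semantic_similarity function like the one sketched in the next section. The length bounds and threshold are made-up illustrative values, and the naive period-based sentence splitter is a stand-in for a real one.

def semantic_refinement(chunks: list[str], threshold: float = 0.55,
                        min_len: int = 200, max_len: int = 1200) -> list[str]:
    # Sketch only: bounds and threshold are illustrative, and size is
    # measured in characters rather than tokens for simplicity.
    refined = []
    for chunk in chunks:
        if len(chunk) < min_len and refined and \
                calculate_semantic_similarity(refined[-1], chunk) >= threshold:
            # TOO_SMALL: fold into the previous chunk when they are related.
            refined[-1] = refined[-1] + "\n" + chunk
        elif len(chunk) > max_len:
            # TOO_LARGE: split where adjacent sentences diverge semantically.
            refined.extend(semantic_split(chunk, threshold))
        else:
            refined.append(chunk)
    return refined

def semantic_split(chunk: str, threshold: float) -> list[str]:
    sentences = chunk.split(". ")  # naive sentence splitter, fine for a sketch
    pieces, current = [], [sentences[0]]
    for prev, sent in zip(sentences, sentences[1:]):
        if calculate_semantic_similarity(prev, sent) < threshold:
            pieces.append(". ".join(current))  # weak boundary: start a new piece
            current = [sent]
        else:
            current.append(sent)
    pieces.append(". ".join(current))
    return pieces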
Semantic Similarity Calculation
FUNCTION calculate_semantic_similarity(text1, text2):
    // Embedding-based similarity: embed both texts, compare with cosine
    embedding1 = generate_embedding(text1)
    embedding2 = generate_embedding(text2)
    similarity = cosine_similarity(embedding1, embedding2)
    RETURN similarity
END FUNCTION
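A minimal Python sketch of this function, assuming the sentence-transformers library and the all-MiniLM-L6-v2 model as the embedding backend; any model that returns dense vectors works the same way.

import numpy as np
from sentence_transformers import SentenceTransformer

# Assumed embedding backend; swap in whatever model you actually use.
_model = SentenceTransformer("all-MiniLM-L6-v2")

def calculate_semantic_similarity(text1: str, text2: str) -> float:
    emb1, emb2 = _model.encode([text1, text2])
    # Cosine similarity: normalized dot product of the two vectors.
    return float(np.dot(emb1, emb2) / (np.linalg.norm(emb1) * np.linalg.norm(emb2)))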
FUNCTION calculate_semantic_coherence(chunk):
    sentences = split_into_sentences(chunk.content)
    IF length(sentences) < 2:
        RETURN 1.0  // A single sentence is trivially coherent

    // Calculate pairwise similarities between all sentence pairs
    similarities = []
    FOR i = 0 to length(sentences) - 2:
        FOR j = i + 1 to length(sentences) - 1:
            similarities.append(calculate_semantic_similarity(sentences[i], sentences[j]))

    // Return average similarity as the coherence score
    RETURN average(similarities)
END FUNCTION
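Using the similarity function above, the coherence score reduces to a few lines. This sketch operates on a pre-split sentence list, and the 1.0 returned for single-sentence chunks is a convention, not something the pseudocode mandates.

from itertools import combinations

def calculate_semantic_coherence(sentences: list[str]) -> float:
    # A chunk with fewer than two sentences is trivially coherent.
    if len(sentences) < 2:
        return 1.0
    # Average similarity over every unordered sentence pair.
    sims = [calculate_semantic_similarity(a, b)
            for a, b in combinations(sentences, 2)]
    return sum(sims) / len(sims)

Note that the pairwise loop is O(n²) in the number of sentences, which is one reason coherence checks are usually applied only to chunks that already passed the size test.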