Files
tell-me/tools/fetch.go
Pavel Pivovarov 50a439a499 Initial commit
2025-12-12 11:15:02 +11:00

173 lines
4.2 KiB
Go

package tools
import (
"bytes"
"fmt"
"io"
"net/http"
"strings"
"time"
md "github.com/JohannesKaufmann/html-to-markdown"
"github.com/jlubawy/go-boilerpipe"
)
// fetchURL fetches a URL, extracts the main article content using boilerpipe,
// and returns it as clean text. When boilerpipe cannot parse the page or
// yields too little content, it falls back to a plain HTML-to-Markdown
// conversion of the whole page.
func fetchURL(targetURL string) (string, error) {
	// Accept only plain http/https URLs.
	if !strings.HasPrefix(targetURL, "http://") && !strings.HasPrefix(targetURL, "https://") {
		return "", fmt.Errorf("invalid URL: must start with http:// or https://")
	}

	// Bound the whole request so a slow server cannot hang the tool.
	client := &http.Client{
		Timeout: 30 * time.Second,
	}

	resp, err := client.Get(targetURL)
	if err != nil {
		return "", fmt.Errorf("failed to fetch URL: %w", err)
	}
	defer resp.Body.Close()

	if resp.StatusCode != http.StatusOK {
		return "", fmt.Errorf("request failed with status %d", resp.StatusCode)
	}

	// Cap the read so a huge (or malicious) response cannot exhaust memory.
	// 10 MiB is far more than any article page needs; cleanMarkdown trims the
	// final output to ~60k characters anyway.
	const maxBodyBytes = 10 << 20
	body, err := io.ReadAll(io.LimitReader(resp.Body, maxBodyBytes))
	if err != nil {
		return "", fmt.Errorf("failed to read response: %w", err)
	}

	// Try to extract just the article content using boilerpipe.
	doc, err := boilerpipe.ParseDocument(bytes.NewReader(body))
	if err != nil {
		// Parsing failed entirely — fall back to markdown conversion.
		return fallbackToMarkdown(targetURL, string(body))
	}

	// ArticlePipeline gives the best extraction results for article pages.
	boilerpipe.ArticlePipeline.Process(doc)
	content := doc.Content()

	// Too little extracted text usually means boilerpipe missed the body;
	// fall back to converting the full page.
	if len(strings.TrimSpace(content)) < 100 {
		return fallbackToMarkdown(targetURL, string(body))
	}

	// Build the result, including the page title when one was detected.
	var result strings.Builder
	result.WriteString(fmt.Sprintf("# Content from: %s\n\n", targetURL))
	if doc.Title != "" {
		result.WriteString(fmt.Sprintf("## %s\n\n", doc.Title))
	}
	result.WriteString(content)
	return result.String(), nil
}
// fallbackToMarkdown converts raw HTML to Markdown; used when boilerpipe
// extraction fails or produces too little content. The result is cleaned and
// prefixed with a header naming the source URL.
func fallbackToMarkdown(targetURL, htmlContent string) (string, error) {
	conv := md.NewConverter("", true, nil)

	markdown, err := conv.ConvertString(htmlContent)
	if err != nil {
		return "", fmt.Errorf("failed to convert HTML to Markdown: %w", err)
	}

	// Normalize whitespace and cap the length before returning.
	return fmt.Sprintf("# Content from: %s\n\n%s", targetURL, cleanMarkdown(markdown)), nil
}
// cleanMarkdown removes excessive whitespace and limits content length
func cleanMarkdown(content string) string {
// Remove excessive blank lines (more than 2 consecutive)
lines := strings.Split(content, "\n")
var cleaned []string
blankCount := 0
for _, line := range lines {
line = strings.TrimSpace(line)
if line == "" {
blankCount++
if blankCount <= 2 {
cleaned = append(cleaned, "")
}
} else {
blankCount = 0
cleaned = append(cleaned, line)
}
}
content = strings.Join(cleaned, "\n")
content = strings.TrimSpace(content)
// Limit content length to approximately 15k tokens (roughly 60k characters)
maxChars := 60000
if len(content) > maxChars {
content = content[:maxChars] + "\n\n[Content truncated due to length...]"
}
return content
}
// FetchArticles fetches up to five URLs concurrently and concatenates whatever
// content could be retrieved, separated by horizontal rules. Failed fetches
// are reported inline; an error is returned only when every fetch failed.
func FetchArticles(urls []string) (string, error) {
	if len(urls) == 0 {
		return "", fmt.Errorf("no URLs provided")
	}

	// Cap concurrent work at five URLs to avoid overwhelming the system.
	const maxURLs = 5
	if len(urls) > maxURLs {
		urls = urls[:maxURLs]
	}

	type fetchResult struct {
		url     string
		content string
		err     error
	}

	// Buffered to len(urls) so every goroutine can send without blocking.
	ch := make(chan fetchResult, len(urls))
	for _, u := range urls {
		go func(target string) {
			text, err := fetchURL(target)
			ch <- fetchResult{url: target, content: text, err: err}
		}(u)
	}

	var out strings.Builder
	out.WriteString("# Combined Content from Multiple Sources\n\n")
	succeeded := 0
	for range urls {
		res := <-ch
		if res.err != nil {
			out.WriteString(fmt.Sprintf("## Failed to fetch: %s\nError: %v\n\n", res.url, res.err))
			continue
		}
		out.WriteString(res.content)
		out.WriteString("\n\n---\n\n")
		succeeded++
	}

	if succeeded == 0 {
		return "", fmt.Errorf("failed to fetch any URLs")
	}
	return out.String(), nil
}