package tools
|
|
|
|
import (
	"bytes"
	"fmt"
	"io"
	"net/http"
	"strings"
	"time"
	"unicode/utf8"

	md "github.com/JohannesKaufmann/html-to-markdown"
	"github.com/jlubawy/go-boilerpipe"
)
|
|
|
|
// fetchURL fetches a URL, extracts main content using boilerpipe, and returns clean text
|
|
func fetchURL(targetURL string) (string, error) {
|
|
// Validate URL
|
|
if !strings.HasPrefix(targetURL, "http://") && !strings.HasPrefix(targetURL, "https://") {
|
|
return "", fmt.Errorf("invalid URL: must start with http:// or https://")
|
|
}
|
|
|
|
// Create HTTP client with timeout
|
|
client := &http.Client{
|
|
Timeout: 30 * time.Second,
|
|
}
|
|
|
|
// Make the request
|
|
resp, err := client.Get(targetURL)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to fetch URL: %w", err)
|
|
}
|
|
defer resp.Body.Close()
|
|
|
|
if resp.StatusCode != http.StatusOK {
|
|
return "", fmt.Errorf("request failed with status %d", resp.StatusCode)
|
|
}
|
|
|
|
// Read the response body
|
|
body, err := io.ReadAll(resp.Body)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to read response: %w", err)
|
|
}
|
|
|
|
// Try to extract article content using boilerpipe
|
|
doc, err := boilerpipe.ParseDocument(bytes.NewReader(body))
|
|
if err != nil {
|
|
// If boilerpipe fails, fall back to markdown conversion
|
|
return fallbackToMarkdown(targetURL, string(body))
|
|
}
|
|
|
|
// Use ArticlePipeline for best results
|
|
boilerpipe.ArticlePipeline.Process(doc)
|
|
|
|
// Get the extracted content
|
|
content := doc.Content()
|
|
|
|
// If content is too short or empty, fall back to markdown
|
|
if len(strings.TrimSpace(content)) < 100 {
|
|
return fallbackToMarkdown(targetURL, string(body))
|
|
}
|
|
|
|
// Build result with title if available
|
|
var result strings.Builder
|
|
result.WriteString(fmt.Sprintf("# Content from: %s\n\n", targetURL))
|
|
|
|
if doc.Title != "" {
|
|
result.WriteString(fmt.Sprintf("## %s\n\n", doc.Title))
|
|
}
|
|
|
|
result.WriteString(content)
|
|
|
|
return result.String(), nil
|
|
}
|
|
|
|
// fallbackToMarkdown converts HTML to markdown when boilerpipe extraction fails
|
|
func fallbackToMarkdown(targetURL, htmlContent string) (string, error) {
|
|
converter := md.NewConverter("", true, nil)
|
|
markdown, err := converter.ConvertString(htmlContent)
|
|
if err != nil {
|
|
return "", fmt.Errorf("failed to convert HTML to Markdown: %w", err)
|
|
}
|
|
|
|
// Clean up the markdown
|
|
markdown = cleanMarkdown(markdown)
|
|
|
|
// Add URL header
|
|
result := fmt.Sprintf("# Content from: %s\n\n%s", targetURL, markdown)
|
|
|
|
return result, nil
|
|
}
|
|
|
|
// cleanMarkdown removes excessive whitespace and limits content length
|
|
func cleanMarkdown(content string) string {
|
|
// Remove excessive blank lines (more than 2 consecutive)
|
|
lines := strings.Split(content, "\n")
|
|
var cleaned []string
|
|
blankCount := 0
|
|
|
|
for _, line := range lines {
|
|
line = strings.TrimSpace(line)
|
|
|
|
if line == "" {
|
|
blankCount++
|
|
if blankCount <= 2 {
|
|
cleaned = append(cleaned, "")
|
|
}
|
|
} else {
|
|
blankCount = 0
|
|
cleaned = append(cleaned, line)
|
|
}
|
|
}
|
|
|
|
content = strings.Join(cleaned, "\n")
|
|
content = strings.TrimSpace(content)
|
|
|
|
// Limit content length to approximately 15k tokens (roughly 60k characters)
|
|
maxChars := 60000
|
|
if len(content) > maxChars {
|
|
content = content[:maxChars] + "\n\n[Content truncated due to length...]"
|
|
}
|
|
|
|
return content
|
|
}
|
|
|
|
// FetchArticles fetches multiple URLs concurrently and combines their content
|
|
func FetchArticles(urls []string) (string, error) {
|
|
if len(urls) == 0 {
|
|
return "", fmt.Errorf("no URLs provided")
|
|
}
|
|
|
|
// Limit to 5 URLs to avoid overwhelming the system
|
|
if len(urls) > 5 {
|
|
urls = urls[:5]
|
|
}
|
|
|
|
type result struct {
|
|
url string
|
|
content string
|
|
err error
|
|
}
|
|
|
|
// Fetch URLs concurrently
|
|
results := make(chan result, len(urls))
|
|
for _, url := range urls {
|
|
go func(u string) {
|
|
content, err := fetchURL(u)
|
|
results <- result{url: u, content: content, err: err}
|
|
}(url)
|
|
}
|
|
|
|
// Collect results
|
|
var combined strings.Builder
|
|
combined.WriteString("# Combined Content from Multiple Sources\n\n")
|
|
|
|
successCount := 0
|
|
for i := 0; i < len(urls); i++ {
|
|
res := <-results
|
|
if res.err != nil {
|
|
combined.WriteString(fmt.Sprintf("## Failed to fetch: %s\nError: %v\n\n", res.url, res.err))
|
|
} else {
|
|
combined.WriteString(res.content)
|
|
combined.WriteString("\n\n---\n\n")
|
|
successCount++
|
|
}
|
|
}
|
|
|
|
if successCount == 0 {
|
|
return "", fmt.Errorf("failed to fetch any URLs")
|
|
}
|
|
|
|
return combined.String(), nil
|
|
}
|