Initial commit
This commit is contained in:
172
tools/fetch.go
Normal file
172
tools/fetch.go
Normal file
@@ -0,0 +1,172 @@
|
||||
package tools
|
||||
|
||||
import (
	"bytes"
	"fmt"
	"io"
	"net/http"
	"strings"
	"sync"
	"time"
	"unicode/utf8"

	md "github.com/JohannesKaufmann/html-to-markdown"
	"github.com/jlubawy/go-boilerpipe"
)
|
||||
|
||||
// fetchURL fetches a URL, extracts main content using boilerpipe, and returns clean text
|
||||
func fetchURL(targetURL string) (string, error) {
|
||||
// Validate URL
|
||||
if !strings.HasPrefix(targetURL, "http://") && !strings.HasPrefix(targetURL, "https://") {
|
||||
return "", fmt.Errorf("invalid URL: must start with http:// or https://")
|
||||
}
|
||||
|
||||
// Create HTTP client with timeout
|
||||
client := &http.Client{
|
||||
Timeout: 30 * time.Second,
|
||||
}
|
||||
|
||||
// Make the request
|
||||
resp, err := client.Get(targetURL)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to fetch URL: %w", err)
|
||||
}
|
||||
defer resp.Body.Close()
|
||||
|
||||
if resp.StatusCode != http.StatusOK {
|
||||
return "", fmt.Errorf("request failed with status %d", resp.StatusCode)
|
||||
}
|
||||
|
||||
// Read the response body
|
||||
body, err := io.ReadAll(resp.Body)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to read response: %w", err)
|
||||
}
|
||||
|
||||
// Try to extract article content using boilerpipe
|
||||
doc, err := boilerpipe.ParseDocument(bytes.NewReader(body))
|
||||
if err != nil {
|
||||
// If boilerpipe fails, fall back to markdown conversion
|
||||
return fallbackToMarkdown(targetURL, string(body))
|
||||
}
|
||||
|
||||
// Use ArticlePipeline for best results
|
||||
boilerpipe.ArticlePipeline.Process(doc)
|
||||
|
||||
// Get the extracted content
|
||||
content := doc.Content()
|
||||
|
||||
// If content is too short or empty, fall back to markdown
|
||||
if len(strings.TrimSpace(content)) < 100 {
|
||||
return fallbackToMarkdown(targetURL, string(body))
|
||||
}
|
||||
|
||||
// Build result with title if available
|
||||
var result strings.Builder
|
||||
result.WriteString(fmt.Sprintf("# Content from: %s\n\n", targetURL))
|
||||
|
||||
if doc.Title != "" {
|
||||
result.WriteString(fmt.Sprintf("## %s\n\n", doc.Title))
|
||||
}
|
||||
|
||||
result.WriteString(content)
|
||||
|
||||
return result.String(), nil
|
||||
}
|
||||
|
||||
// fallbackToMarkdown converts HTML to markdown when boilerpipe extraction fails
|
||||
func fallbackToMarkdown(targetURL, htmlContent string) (string, error) {
|
||||
converter := md.NewConverter("", true, nil)
|
||||
markdown, err := converter.ConvertString(htmlContent)
|
||||
if err != nil {
|
||||
return "", fmt.Errorf("failed to convert HTML to Markdown: %w", err)
|
||||
}
|
||||
|
||||
// Clean up the markdown
|
||||
markdown = cleanMarkdown(markdown)
|
||||
|
||||
// Add URL header
|
||||
result := fmt.Sprintf("# Content from: %s\n\n%s", targetURL, markdown)
|
||||
|
||||
return result, nil
|
||||
}
|
||||
|
||||
// cleanMarkdown removes excessive whitespace and limits content length.
//
// Each line is trimmed of surrounding whitespace, runs of more than two
// consecutive blank lines are collapsed to two, and the result is capped at
// roughly 60k characters (about 15k tokens). Truncation never splits a
// multi-byte UTF-8 rune, so valid UTF-8 input always yields valid UTF-8
// output (the previous byte-offset slice could cut a rune in half).
func cleanMarkdown(content string) string {
	// Remove excessive blank lines (more than 2 consecutive)
	lines := strings.Split(content, "\n")
	cleaned := make([]string, 0, len(lines))
	blankCount := 0

	for _, line := range lines {
		line = strings.TrimSpace(line)

		if line == "" {
			blankCount++
			// Keep at most two consecutive blank lines.
			if blankCount <= 2 {
				cleaned = append(cleaned, "")
			}
		} else {
			blankCount = 0
			cleaned = append(cleaned, line)
		}
	}

	content = strings.TrimSpace(strings.Join(cleaned, "\n"))

	// Limit content length to approximately 15k tokens (roughly 60k characters)
	const maxChars = 60000
	if len(content) > maxChars {
		// Back up to a rune boundary so truncation cannot produce invalid UTF-8.
		cut := maxChars
		for cut > 0 && !utf8.RuneStart(content[cut]) {
			cut--
		}
		content = content[:cut] + "\n\n[Content truncated due to length...]"
	}

	return content
}
|
||||
|
||||
// FetchArticles fetches multiple URLs concurrently and combines their content
|
||||
func FetchArticles(urls []string) (string, error) {
|
||||
if len(urls) == 0 {
|
||||
return "", fmt.Errorf("no URLs provided")
|
||||
}
|
||||
|
||||
// Limit to 5 URLs to avoid overwhelming the system
|
||||
if len(urls) > 5 {
|
||||
urls = urls[:5]
|
||||
}
|
||||
|
||||
type result struct {
|
||||
url string
|
||||
content string
|
||||
err error
|
||||
}
|
||||
|
||||
// Fetch URLs concurrently
|
||||
results := make(chan result, len(urls))
|
||||
for _, url := range urls {
|
||||
go func(u string) {
|
||||
content, err := fetchURL(u)
|
||||
results <- result{url: u, content: content, err: err}
|
||||
}(url)
|
||||
}
|
||||
|
||||
// Collect results
|
||||
var combined strings.Builder
|
||||
combined.WriteString("# Combined Content from Multiple Sources\n\n")
|
||||
|
||||
successCount := 0
|
||||
for i := 0; i < len(urls); i++ {
|
||||
res := <-results
|
||||
if res.err != nil {
|
||||
combined.WriteString(fmt.Sprintf("## Failed to fetch: %s\nError: %v\n\n", res.url, res.err))
|
||||
} else {
|
||||
combined.WriteString(res.content)
|
||||
combined.WriteString("\n\n---\n\n")
|
||||
successCount++
|
||||
}
|
||||
}
|
||||
|
||||
if successCount == 0 {
|
||||
return "", fmt.Errorf("failed to fetch any URLs")
|
||||
}
|
||||
|
||||
return combined.String(), nil
|
||||
}
|
||||
Reference in New Issue
Block a user