package tools import ( "bytes" "fmt" "io" "net/http" "strings" "time" md "github.com/JohannesKaufmann/html-to-markdown" "github.com/jlubawy/go-boilerpipe" ) // fetchURL fetches a URL, extracts main content using boilerpipe, and returns clean text func fetchURL(targetURL string) (string, error) { // Validate URL if !strings.HasPrefix(targetURL, "http://") && !strings.HasPrefix(targetURL, "https://") { return "", fmt.Errorf("invalid URL: must start with http:// or https://") } // Create HTTP client with timeout client := &http.Client{ Timeout: 30 * time.Second, } // Make the request resp, err := client.Get(targetURL) if err != nil { return "", fmt.Errorf("failed to fetch URL: %w", err) } defer resp.Body.Close() if resp.StatusCode != http.StatusOK { return "", fmt.Errorf("request failed with status %d", resp.StatusCode) } // Read the response body body, err := io.ReadAll(resp.Body) if err != nil { return "", fmt.Errorf("failed to read response: %w", err) } // Try to extract article content using boilerpipe doc, err := boilerpipe.ParseDocument(bytes.NewReader(body)) if err != nil { // If boilerpipe fails, fall back to markdown conversion return fallbackToMarkdown(targetURL, string(body)) } // Use ArticlePipeline for best results boilerpipe.ArticlePipeline.Process(doc) // Get the extracted content content := doc.Content() // If content is too short or empty, fall back to markdown if len(strings.TrimSpace(content)) < 100 { return fallbackToMarkdown(targetURL, string(body)) } // Build result with title if available var result strings.Builder result.WriteString(fmt.Sprintf("# Content from: %s\n\n", targetURL)) if doc.Title != "" { result.WriteString(fmt.Sprintf("## %s\n\n", doc.Title)) } result.WriteString(content) return result.String(), nil } // fallbackToMarkdown converts HTML to markdown when boilerpipe extraction fails func fallbackToMarkdown(targetURL, htmlContent string) (string, error) { converter := md.NewConverter("", true, nil) markdown, err := converter.ConvertString(htmlContent) if err != nil { return "", fmt.Errorf("failed to convert HTML to Markdown: %w", err) } // Clean up the markdown markdown = cleanMarkdown(markdown) // Add URL header result := fmt.Sprintf("# Content from: %s\n\n%s", targetURL, markdown) return result, nil } // cleanMarkdown removes excessive whitespace and limits content length func cleanMarkdown(content string) string { // Remove excessive blank lines (more than 2 consecutive) lines := strings.Split(content, "\n") var cleaned []string blankCount := 0 for _, line := range lines { line = strings.TrimSpace(line) if line == "" { blankCount++ if blankCount <= 2 { cleaned = append(cleaned, "") } } else { blankCount = 0 cleaned = append(cleaned, line) } } content = strings.Join(cleaned, "\n") content = strings.TrimSpace(content) // Limit content length to approximately 15k tokens (roughly 60k characters) maxChars := 60000 if len(content) > maxChars { content = content[:maxChars] + "\n\n[Content truncated due to length...]" } return content } // FetchArticles fetches multiple URLs concurrently and combines their content func FetchArticles(urls []string) (string, error) { if len(urls) == 0 { return "", fmt.Errorf("no URLs provided") } // Limit to 5 URLs to avoid overwhelming the system if len(urls) > 5 { urls = urls[:5] } type result struct { url string content string err error } // Fetch URLs concurrently results := make(chan result, len(urls)) for _, url := range urls { go func(u string) { content, err := fetchURL(u) results <- result{url: u, content: content, err: err} }(url) } // Collect results var combined strings.Builder combined.WriteString("# Combined Content from Multiple Sources\n\n") successCount := 0 for i := 0; i < len(urls); i++ { res := <-results if res.err != nil { combined.WriteString(fmt.Sprintf("## Failed to fetch: %s\nError: %v\n\n", res.url, res.err)) } else { combined.WriteString(res.content) combined.WriteString("\n\n---\n\n") successCount++ } } if successCount == 0 { return "", fmt.Errorf("failed to fetch any URLs") } return combined.String(), nil }