// File: tell-me/tools/search.go
// Last modified: 2026-01-09 11:39:27 +11:00
//
// Web search tool backed by Startpage HTML scraping.
package tools
import (
"context"
"fmt"
"net/http"
"net/url"
"os"
"strings"
"time"
"github.com/PuerkitoBio/goquery"
)
const (
	// maxRetries is the number of additional attempts after the first
	// failed client.Do (so up to maxRetries+1 attempts in total).
	maxRetries = 3
	// timeout bounds both the HTTP client and the per-search context.
	timeout = 60 * time.Second
)
// SearchResult represents a single search result
type SearchResult struct {
	Title   string `json:"title"`   // anchor text of the result link
	Content string `json:"content"` // snippet/description near the link; may be empty, truncated to ~200 bytes
	URL     string `json:"url"`     // absolute http(s) URL of the result
}
// WebSearch performs a web search using Startpage and returns up to the
// first ten results rendered as numbered plain text. When the scrape
// yields nothing it returns the literal string "No results found." with
// a nil error; transport/parse failures are returned wrapped.
func WebSearch(query string) (string, error) {
	ctx, cancel := context.WithTimeout(context.Background(), timeout)
	defer cancel()

	// One-shot client honoring HTTP_PROXY/HTTPS_PROXY/ALL_PROXY.
	httpClient := &http.Client{
		Timeout:   timeout,
		Transport: &http.Transport{Proxy: http.ProxyFromEnvironment},
	}

	results, err := searchStartpage(ctx, httpClient, query)
	if err != nil {
		return "", fmt.Errorf("Startpage search failed: %w", err)
	}
	if len(results) == 0 {
		return "No results found.", nil
	}

	var b strings.Builder
	fmt.Fprintf(&b, "Found %d results:\n\n", len(results))

	// Render at most the top 10 results.
	top := results
	if len(top) > 10 {
		top = top[:10]
	}
	for idx, r := range top {
		fmt.Fprintf(&b, "%d. %s\n", idx+1, r.Title)
		fmt.Fprintf(&b, " URL: %s\n", r.URL)
		if r.Content != "" {
			fmt.Fprintf(&b, " %s\n", r.Content)
		}
		b.WriteString("\n")
	}
	return b.String(), nil
}
// searchStartpage performs the actual Startpage search and scrapes result
// links out of the returned HTML.
//
// It returns one SearchResult per unique external http(s) link, in document
// order. Internal Startpage links, anchors, relative links, and links with
// short (<10 byte) titles are skipped as navigation noise. Errors are
// returned for request construction/transport failures, unexpected HTTP
// status codes, or an unparseable body.
func searchStartpage(ctx context.Context, client *http.Client, query string) ([]SearchResult, error) {
	// Build the Startpage search URL with the query safely escaped.
	searchURL := fmt.Sprintf("https://www.startpage.com/sp/search?query=%s", url.QueryEscape(query))

	req, err := http.NewRequestWithContext(ctx, "GET", searchURL, nil)
	if err != nil {
		return nil, fmt.Errorf("failed to create search request: %w", err)
	}

	// Browser-like headers reduce the chance of being served a bot page.
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8")
	req.Header.Set("Accept-Language", "en-US,en;q=0.5")

	resp, err := executeWithRetry(ctx, client, req, "search Startpage")
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()

	// 202 is tolerated alongside 200 — presumably Startpage sometimes
	// answers Accepted with a usable body; TODO confirm.
	if resp.StatusCode != http.StatusOK && resp.StatusCode != http.StatusAccepted {
		return nil, fmt.Errorf("search request failed with status: %d %s", resp.StatusCode, resp.Status)
	}

	doc, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("failed to parse search results: %w", err)
	}

	var results []SearchResult
	// Track URLs already collected so duplicate detection is O(1) per link
	// instead of a linear scan over all prior results.
	seen := make(map[string]struct{})

	doc.Find("a[href]").Each(func(i int, s *goquery.Selection) {
		href, exists := s.Attr("href")
		if !exists {
			return
		}
		// Skip internal Startpage links and Anonymous View links.
		if strings.Contains(href, "startpage.com") ||
			strings.Contains(href, "/av/proxy") ||
			strings.HasPrefix(href, "#") ||
			strings.HasPrefix(href, "/") {
			return
		}
		// Only process absolute HTTP/HTTPS URLs.
		if !strings.HasPrefix(href, "http://") && !strings.HasPrefix(href, "https://") {
			return
		}
		title := strings.TrimSpace(s.Text())
		if title == "" {
			return
		}
		// Skip very short titles (likely navigation or other non-content links).
		if len(title) < 10 {
			return
		}
		// Skip URLs we already collected (avoid duplicates).
		if _, dup := seen[href]; dup {
			return
		}

		// Try to find description text near the link by walking up the
		// ancestor chain and taking the text that follows the title.
		var description string
		parent := s.Parent()
		for parent.Length() > 0 {
			text := strings.TrimSpace(parent.Text())
			if len(text) > len(title)+20 { // ancestor text long enough to hold a description
				if idx := strings.Index(text, title); idx >= 0 {
					remainder := strings.TrimSpace(text[idx+len(title):])
					if len(remainder) > 20 { // plausible description length
						description = remainder
						break
					}
				}
			}
			parent = parent.Parent()
			if parent.Length() == 0 {
				break
			}
		}
		// Limit description length (byte-based; may split a multi-byte rune).
		if len(description) > 200 {
			description = description[:200] + "..."
		}

		seen[href] = struct{}{}
		results = append(results, SearchResult{
			Title:   title,
			URL:     href,
			Content: description,
		})
	})
	return results, nil
}
// executeWithRetry executes an HTTP request, retrying transport-level
// failures with exponential backoff (1s, 2s, 4s between attempts, up to
// maxRetries extra attempts).
//
// Only errors from client.Do are retried; any received response — whatever
// its status code — is returned immediately, and the caller owns closing
// its Body. Context cancellation is honored while waiting between attempts.
// Errors are wrapped with %w so callers can use errors.Is/errors.As (e.g.
// to detect context.Canceled or context.DeadlineExceeded).
//
// NOTE(review): requests with a non-nil Body are generally not safe to
// re-send after a partial write; current callers only issue GETs with a
// nil body — confirm before reusing for other methods.
func executeWithRetry(ctx context.Context, client *http.Client, req *http.Request, operation string) (*http.Response, error) {
	var lastErr error
	for attempt := 0; attempt <= maxRetries; attempt++ {
		resp, err := client.Do(req)
		if err == nil {
			return resp, nil
		}
		lastErr = err
		// Don't sleep after the final attempt.
		if attempt == maxRetries {
			break
		}
		// Exponential backoff: 1s, 2s, 4s.
		backoff := time.Duration(1<<uint(attempt)) * time.Second
		select {
		case <-ctx.Done():
			// Wrap with %w (was %v) so the context error survives unwrapping.
			return nil, fmt.Errorf("%s cancelled: %w", operation, ctx.Err())
		case <-time.After(backoff):
			// Continue to next attempt.
		}
	}
	// Wrap with %w (was %v) to preserve the underlying transport error.
	return nil, fmt.Errorf("failed to %s after %d retries: %w", operation, maxRetries, lastErr)
}
// init logs which proxy environment variable (if any) will be used, for
// debugging. Only the first variable found is reported, in precedence
// order: HTTPS_PROXY, then HTTP_PROXY, then ALL_PROXY.
func init() {
	for _, name := range []string{"HTTPS_PROXY", "HTTP_PROXY", "ALL_PROXY"} {
		if value := os.Getenv(name); value != "" {
			fmt.Printf("Using %s: %s\n", name, value)
			return
		}
	}
}