Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -38,3 +38,4 @@ sanitize_html_fixtures.py
core/testdata/*
.tmp-gocache
.tmpcache/
google/data/geotargets-2026-05-28.csv
13 changes: 6 additions & 7 deletions baidu/search.go
Original file line number Diff line number Diff line change
Expand Up @@ -87,12 +87,6 @@ func (baid *Baidu) Search(ctx context.Context, query core.Query) (results []core
baid = &scoped

baid.logger.Debug("Starting search, query: %+v", query)
defer func() {
if recovered := recover(); recovered != nil {
err = core.RecoverEnginePanicWithContext(ctx, baid.Name(), recovered, baid.logger)
results = nil
}
}()

// Build URL from query struct to open in browser
url, err := BuildURL(query)
Expand Down Expand Up @@ -166,7 +160,12 @@ func (baid *Baidu) waitForParsedSearchResults(ctx context.Context, page *rod.Pag
}
return nil, core.ErrParser
}
return nil, nil
// The page never reached a recognizable state: no result containers, no
// captcha or timeout markers. Baidu hydrates result cards client-side and
// can exceed the selector deadline, so report a timeout rather than a
// successful empty SERP — callers must retry/skip, not trust 0 results.
baid.logger.Debug("No result containers or block markers within selector timeout")
return nil, core.ErrSearchTimeout
}

// SearchImage executes a Baidu image search and returns normalized image
Expand Down
2 changes: 1 addition & 1 deletion baidu/search_integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@ import (

func TestSearchBaidu(t *testing.T) {
ithelper.RunEngineTests(t, func(b *core.Browser) core.SearchEngine {
return New(*b, core.SearchEngineOptions{})
return New(*b, ithelper.EngineOptions())
})
}
6 changes: 0 additions & 6 deletions baidu/search_raw.go
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,6 @@ func classifyBaiduRawHTML(body []byte) error {

func Search(ctx context.Context, query core.Query) (results []core.SearchResult, err error) {
ctx = core.PrepareEngineContext(ctx, query, "baidu", false)
defer func() {
if recovered := recover(); recovered != nil {
err = core.RecoverEnginePanicWithContext(ctx, "baidu", recovered, nil)
results = nil
}
}()

searchURL, err := BuildURL(query)
if err != nil {
Expand Down
6 changes: 0 additions & 6 deletions bing/search.go
Original file line number Diff line number Diff line change
Expand Up @@ -151,12 +151,6 @@ func (bing *Bing) Search(ctx context.Context, query core.Query) (results []core.
bing = &scoped

bing.logger.Debug("Starting search, query: %+v", query)
defer func() {
if recovered := recover(); recovered != nil {
err = core.RecoverEnginePanicWithContext(ctx, bing.Name(), recovered, bing.logger)
results = nil
}
}()

searchResults := []core.SearchResult{}

Expand Down
2 changes: 1 addition & 1 deletion bing/search_integration_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,6 @@ import (

func TestSearchBing(t *testing.T) {
ithelper.RunEngineTests(t, func(b *core.Browser) core.SearchEngine {
return New(*b, core.SearchEngineOptions{})
return New(*b, ithelper.EngineOptions())
})
}
3 changes: 2 additions & 1 deletion cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import (
)

const (
version = "0.8.2"
version = "0.8.3"
defaultConfigFilename = "config"
envPrefix = "OPENSERP"
)
Expand Down Expand Up @@ -412,6 +412,7 @@ func setConfigDefaults(v *viper.Viper) {
v.SetDefault("extract.timeout", "20s")
v.SetDefault("extract.max_bytes", 2*1024*1024)
v.SetDefault("extract.max_concurrent", 2)
v.SetDefault("extract.allow_private_networks", false)
// Keep stage2 defaults stable even when config file is absent.
v.SetDefault("resilience.max_retries", 3)
v.SetDefault("resilience.allow_endpoint_fallback", false)
Expand Down
5 changes: 4 additions & 1 deletion cmd/search.go
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,10 @@ func search(cmd *cobra.Command, args []string) {

func searchBrowser(engineType string, query core.Query, browserProxyURL string, captchaSolverEnabled bool, captchaSolverAPIKey string) ([]core.SearchResult, error) {
var engine core.SearchEngine
blockedResourceTypes := core.MustParseBlockedResourceTypes(config.App.BlockResources)
blockedResourceTypes, err := core.ParseBlockedResourceTypes(config.App.BlockResources)
if err != nil {
return nil, fmt.Errorf("invalid block_resources config: %w", err)
}
if core.IsAuthenticatedSocksProxyURL(browserProxyURL) {
return nil, fmt.Errorf(
"%w: browser runtime does not support authenticated SOCKS proxy %s",
Expand Down
33 changes: 21 additions & 12 deletions cmd/serve.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@ import (

// rawEngine implements SearchEngine interface for raw HTTP requests
type rawEngine struct {
name string
name string
limiterMu sync.Mutex
limiter *rate.Limiter
}

func (r *rawEngine) Search(ctx context.Context, q core.Query) ([]core.SearchResult, error) {
Expand Down Expand Up @@ -60,8 +62,13 @@ func (r *rawEngine) IsInitialized() bool {
}

func (r *rawEngine) GetRateLimiter() *rate.Limiter {
// Use default rate limiter for raw requests
return rate.NewLimiter(rate.Every(time.Second), 5)
r.limiterMu.Lock()
defer r.limiterMu.Unlock()
if r.limiter == nil {
// Use default rate limiter for raw requests.
r.limiter = rate.NewLimiter(rate.Every(time.Second), 5)
}
return r.limiter
}

var serveCMD = &cobra.Command{
Expand Down Expand Up @@ -151,6 +158,14 @@ func buildFingerprintBrowserOptions() core.BrowserOpts {
}

func buildServerOptions(corsCfg core.CORSConfig, proxyCfg core.ProxyConfig, fingerprintBrowserOpts core.BrowserOpts) core.ServerOptions {
retryCfg := core.RetryConfig{
MaxRetries: config.Resilience.MaxRetries,
InitialBackoff: 1 * time.Second,
MaxBackoff: 30 * time.Second,
BackoffFactor: 2.0,
}
engineTimeout := time.Duration(config.App.Timeout) * time.Second

return core.ServerOptions{
CacheTTL: time.Duration(config.Cache.TTLSeconds) * time.Second,
CacheMaxSize: config.Cache.MaxSize,
Expand All @@ -161,14 +176,10 @@ func buildServerOptions(corsCfg core.CORSConfig, proxyCfg core.ProxyConfig, fing
FingerprintArtifactDir: core.DefaultFingerprintArtifactDir,
FingerprintBrowserOpts: fingerprintBrowserOpts,
MegaTimeout: config.App.MegaTimeout,
RequestTimeout: core.RequestTimeoutForRetries(engineTimeout, retryCfg),
Extract: config.Extract,
Resilience: core.ResilientConfig{
Retry: core.RetryConfig{
MaxRetries: config.Resilience.MaxRetries,
InitialBackoff: 1 * time.Second,
MaxBackoff: 30 * time.Second,
BackoffFactor: 2.0,
},
Retry: retryCfg,
CircuitBreaker: core.CircuitBreakerConfig{
FailureThreshold: config.CircuitBreaker.Failures,
RecoveryTimeout: time.Duration(config.CircuitBreaker.RecoverySeconds) * time.Second,
Expand Down Expand Up @@ -530,7 +541,6 @@ func (p *browserPool) close() error {

type pooledBrowserEngine struct {
name string
limiter *rate.Limiter
opts core.SearchEngineOptions
factory func(core.Browser, core.SearchEngineOptions) core.SearchEngine
pool *browserPool
Expand Down Expand Up @@ -574,7 +584,7 @@ func (e *pooledBrowserEngine) Name() string {
}

func (e *pooledBrowserEngine) GetRateLimiter() *rate.Limiter {
return e.limiter
return e.opts.GetRateLimiter()
}

func (e *pooledBrowserEngine) DropProxyLaneCookies(ctx context.Context, q core.Query) {
Expand Down Expand Up @@ -699,7 +709,6 @@ func buildBrowserEngines(baseOpts core.BrowserOpts, proxyCfg core.ProxyConfig) (
opts.Init()
base := &pooledBrowserEngine{
name: spec.name,
limiter: rate.NewLimiter(rate.Every(opts.GetRatelimit()), opts.RateBurst),
opts: opts,
factory: spec.factory,
pool: pool,
Expand Down
11 changes: 11 additions & 0 deletions cmd/serve_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,17 @@ import (
"github.com/karust/openserp/core"
)

func TestRawEngineCachesRateLimiter(t *testing.T) {
engine := &rawEngine{name: "google"}
first := engine.GetRateLimiter()
if first == nil {
t.Fatal("expected limiter")
}
if second := engine.GetRateLimiter(); second != first {
t.Fatal("expected rawEngine to return the cached limiter")
}
}

func TestBrowserPoolKey(t *testing.T) {
cases := []struct {
name string
Expand Down
4 changes: 3 additions & 1 deletion config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ server:

app:
log_format: "text" # json|text
timeout: 15 # Browser/search timeout in seconds
timeout: 15 # Browser/search timeout in seconds, per attempt; the request deadline is derived from this x retries
browser_path: "" # Custom browser binary path (chrome/chromium/edge..)
profiles: "" # Overriding built-in browser profiles
head: false # Headful mode
Expand All @@ -18,6 +18,8 @@ app:
idle_ttl: 5m # close a Chrome that has not served traffic for this long

mega_timeout: 90s # max total wait for /mega/* requests; slow engines return partial results
block_trackers: true
block_resources: "image,font,css,media"

extract:
enabled: true
Expand Down
6 changes: 4 additions & 2 deletions core/browser.go
Original file line number Diff line number Diff line change
Expand Up @@ -213,13 +213,15 @@ func (b *Browser) configureRequestBlocking(ctx context.Context, page *rod.Page)

blocked := blockedResourceTypeSet(b.BlockResourceTypes)
router := page.HijackRequests()
router.MustAdd("*", func(h *rod.Hijack) {
if err := router.Add("*", "", func(h *rod.Hijack) {
if _, ok := blocked[h.Request.Type()]; ok {
h.Response.Fail(proto.NetworkErrorReasonBlockedByClient)
return
}
h.ContinueRequest(&proto.FetchContinueRequest{})
})
}); err != nil {
return fmt.Errorf("install resource-blocking route: %w", err)
}

go router.Run()

Expand Down
33 changes: 30 additions & 3 deletions core/common.go
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import (
"sort"
"strconv"
"strings"
"sync"
"time"

"github.com/gofiber/fiber/v2"
Expand Down Expand Up @@ -294,6 +295,9 @@ type Query struct {
ProxyOverride string
// Insecure enables insecure TLS for request/browser execution.
Insecure bool
// GuardPrivateNetworks rejects raw HTTP targets that resolve to private,
// loopback, link-local, multicast, or otherwise non-public addresses.
GuardPrivateNetworks bool
}

// String renders Query for logs with the proxy URL credentials masked. The
Expand Down Expand Up @@ -447,8 +451,18 @@ type SearchEngineOptions struct {
// IsSolveCaptcha enables automatic captcha solving when engine support and
// solver credentials are configured.
IsSolveCaptcha bool `mapstructure:"captcha"`

limiterState *rateLimiterState
}

type rateLimiterState struct {
limiter *rate.Limiter
every time.Duration
burst int
}

var searchEngineOptionsLimiterMu sync.Mutex

// Init sets default option values when fields are zero.
func (o *SearchEngineOptions) Init() {
if o.RateRequests == 0 {
Expand All @@ -471,10 +485,23 @@ func (o *SearchEngineOptions) GetRatelimit() time.Duration {
return (time.Duration(o.RateTime) * time.Second) / time.Duration(o.RateRequests)
}

// GetRateLimiter returns a limiter configured from SearchEngineOptions.
// Call Init() first so RateBurst is non-zero.
// GetRateLimiter returns a cached limiter configured from SearchEngineOptions.
// Call Init() first so RateBurst is non-zero. Do not copy SearchEngineOptions
// after first use; the limiter state is intentionally shared by each engine.
func (o *SearchEngineOptions) GetRateLimiter() *rate.Limiter {
return rate.NewLimiter(rate.Every(o.GetRatelimit()), o.RateBurst)
every := o.GetRatelimit()
burst := o.RateBurst
searchEngineOptionsLimiterMu.Lock()
defer searchEngineOptionsLimiterMu.Unlock()
if o.limiterState == nil {
o.limiterState = &rateLimiterState{}
}
if o.limiterState.limiter == nil || o.limiterState.every != every || o.limiterState.burst != burst {
o.limiterState.limiter = rate.NewLimiter(rate.Every(every), burst)
o.limiterState.every = every
o.limiterState.burst = burst
}
return o.limiterState.limiter
}

// GetSelectorTimeout returns the selector wait timeout as time.Duration.
Expand Down
35 changes: 31 additions & 4 deletions core/http_client.go
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,11 @@ func DrainAndCloseResponse(resp *http.Response) {
// derived from the query locale. The caller owns the returned response and
// must drain/close it (see DrainAndCloseResponse).
func RawSearchRequest(ctx context.Context, searchURL string, query Query) (*http.Response, error) {
if query.GuardPrivateNetworks {
if err := ValidatePublicHTTPURL(ctx, searchURL); err != nil {
return nil, err
}
}
client, err := NewRawHTTPClient(query)
if err != nil {
return nil, err
Expand Down Expand Up @@ -96,15 +101,26 @@ func NewRawHTTPClient(query Query) (*http.Client, error) {
roundTripper = proxyErrorTransport{base: roundTripper}
}

return &http.Client{
client := &http.Client{
Transport: roundTripper,
Timeout: rawHTTPTimeout,
}, nil
}
if query.GuardPrivateNetworks {
client.CheckRedirect = func(req *http.Request, via []*http.Request) error {
return ValidatePublicHTTPURL(req.Context(), req.URL.String())
}
}
return client, nil
}

func newRawTransport(query Query) (*http.Transport, error) {
dialContext := dialNetworkUsageConn
if query.GuardPrivateNetworks {
dialContext = guardedDialNetworkUsageConn
}

transport := &http.Transport{
DialContext: dialNetworkUsageConn,
DialContext: dialContext,
}
if query.Insecure {
transport.TLSClientConfig = &tls.Config{InsecureSkipVerify: true}
Expand All @@ -122,12 +138,15 @@ func newRawTransport(query Query) (*http.Transport, error) {

// Keep proxied requests on the standard transport path so SOCKS5/SOCKS5H
// resolution and routing are handled by the configured proxy correctly.
// The extract SSRF guard validates the target URL before the request and
// on redirects; the proxy address itself may legitimately be local.
transport.DialContext = dialNetworkUsageConn
transport.Proxy = http.ProxyURL(parsed)
return transport, nil
}

transport.DialTLSContext = func(ctx context.Context, network, addr string) (net.Conn, error) {
rawConn, err := dialNetworkUsageConn(ctx, network, addr)
rawConn, err := dialContext(ctx, network, addr)
if err != nil {
return nil, err
}
Expand Down Expand Up @@ -175,6 +194,14 @@ func dialNetworkUsageConn(ctx context.Context, network, addr string) (net.Conn,
return networkUsageConn{Conn: conn, ctx: ctx}, nil
}

func guardedDialNetworkUsageConn(ctx context.Context, network, addr string) (net.Conn, error) {
conn, err := GuardedDialContext(ctx, network, addr)
if err != nil {
return nil, err
}
return networkUsageConn{Conn: conn, ctx: ctx}, nil
}

type networkUsageConn struct {
net.Conn
ctx context.Context
Expand Down
Loading
Loading