diff --git a/posthog/__init__.py b/posthog/__init__.py index 871b3ceb..4dd9f610 100644 --- a/posthog/__init__.py +++ b/posthog/__init__.py @@ -67,6 +67,7 @@ FlagDefinitionCacheProvider as FlagDefinitionCacheProvider, ) from posthog.request import ( + DEFAULT_FEATURE_FLAGS_REQUEST_MAX_RETRIES as DEFAULT_FEATURE_FLAGS_REQUEST_MAX_RETRIES, disable_connection_reuse as disable_connection_reuse, enable_keep_alive as enable_keep_alive, set_socket_options as set_socket_options, @@ -322,6 +323,9 @@ def get_tags() -> Dict[str, Any]: attributed to the person normally. feature_flags_request_timeout_seconds: Timeout in seconds for feature flag and remote config requests. + feature_flags_request_max_retries: Number of retries for feature flag + requests after network, transport, or timeout failures. Defaults to 1. + Set to 0 to disable retries. super_properties: Properties merged into every captured event. enable_exception_autocapture: Automatically capture uncaught exceptions. log_captured_exceptions: Also log exceptions captured by error tracking. @@ -365,6 +369,7 @@ def get_tags() -> Dict[str, Any]: disable_geoip = True # type: bool is_server = True # type: bool feature_flags_request_timeout_seconds = 3 # type: int +feature_flags_request_max_retries = DEFAULT_FEATURE_FLAGS_REQUEST_MAX_RETRIES # type: int super_properties = None # type: Optional[Dict] enable_exception_autocapture = False # type: bool log_captured_exceptions = False # type: bool @@ -1156,6 +1161,7 @@ def setup() -> Client: disable_geoip=disable_geoip, is_server=is_server, feature_flags_request_timeout_seconds=feature_flags_request_timeout_seconds, + feature_flags_request_max_retries=feature_flags_request_max_retries, super_properties=super_properties, # TODO: Currently this monitoring begins only when the Client is initialised (which happens when you do something with the SDK) # This kind of initialisation is very annoying for exception capture. We need to figure out a way around this, diff --git a/posthog/client.py b/posthog/client.py index 1fbff223..67772e4b 100644 --- a/posthog/client.py +++ b/posthog/client.py @@ -69,6 +69,7 @@ AI_EVENTS_ENDPOINT, EVENTS_ENDPOINT, APIError, + DEFAULT_FEATURE_FLAGS_REQUEST_MAX_RETRIES, QuotaLimitError, RequestsConnectionError, RequestsTimeout, @@ -240,6 +241,7 @@ def __init__( is_server=True, historical_migration=False, feature_flags_request_timeout_seconds=3, + feature_flags_request_max_retries=DEFAULT_FEATURE_FLAGS_REQUEST_MAX_RETRIES, super_properties=None, enable_exception_autocapture=False, log_captured_exceptions=False, @@ -296,6 +298,9 @@ def __init__( historical_migration: Mark events as historical migration imports. feature_flags_request_timeout_seconds: Timeout in seconds for feature flag and remote config requests. + feature_flags_request_max_retries: Number of retries for feature flag + requests after network, transport, or timeout failures. Defaults + to 1. Set to 0 to disable retries. super_properties: Properties merged into every captured event. enable_exception_autocapture: Automatically capture uncaught exceptions. @@ -376,6 +381,9 @@ def __init__( self.feature_flags_request_timeout_seconds = ( feature_flags_request_timeout_seconds ) + self.feature_flags_request_max_retries = max( + 0, feature_flags_request_max_retries + ) self.poller: Optional[Poller] = None self.distinct_ids_feature_flags_reported = SizeLimitedDict(MAX_DICT_SIZE, set) self.flag_fallback_cache_url = flag_fallback_cache_url @@ -895,10 +903,18 @@ def _get_flags_decision( if flag_keys_to_evaluate: request_data["flag_keys_to_evaluate"] = flag_keys_to_evaluate + flag_request_options: Dict[str, Any] = {} + if ( + self.feature_flags_request_max_retries + != DEFAULT_FEATURE_FLAGS_REQUEST_MAX_RETRIES + ): + flag_request_options["max_retries"] = self.feature_flags_request_max_retries + resp_data = flags( self.api_key, self.host, timeout=self.feature_flags_request_timeout_seconds, + **flag_request_options, **request_data, ) diff --git a/posthog/request.py b/posthog/request.py index de16f33a..12d63677 100644 --- a/posthog/request.py +++ b/posthog/request.py @@ -2,6 +2,7 @@ import logging import re import socket +import time from dataclasses import dataclass from datetime import date, datetime, timezone from gzip import GzipFile @@ -41,8 +42,8 @@ if hasattr(socket, attr): KEEP_ALIVE_SOCKET_OPTIONS.append((socket.SOL_TCP, getattr(socket, attr), value)) -# Status codes that indicate transient server errors worth retrying -RETRY_STATUS_FORCELIST = [408, 500, 502, 503, 504] +DEFAULT_FEATURE_FLAGS_REQUEST_MAX_RETRIES = 1 +FEATURE_FLAGS_RETRY_BACKOFF_SECONDS = 0.3 def _mask_tokens_in_url(url: str) -> str: @@ -90,23 +91,14 @@ def _build_session(socket_options: Optional[SocketOptions] = None) -> requests.S def _build_flags_session( socket_options: Optional[SocketOptions] = None, ) -> requests.Session: - """ - Build a session for feature flag requests with POST retries. + """Build a session for feature flag requests. - Feature flag requests are idempotent (read-only), so retrying POST - requests is safe. This session retries on transient server errors - (408, 5xx) and network failures with exponential backoff - (0.5s, 1s delays between retries). + /flags retries are handled explicitly in ``flags()`` so that only + transport failures are retried. HTTP status responses must surface as API + errors without retrying. """ adapter = HTTPAdapterWithSocketOptions( - max_retries=Retry( - total=2, - connect=2, - read=2, - backoff_factor=0.5, - status_forcelist=RETRY_STATUS_FORCELIST, - allowed_methods=["POST"], - ), + max_retries=Retry(total=0, connect=0, read=0, status=0), socket_options=socket_options, ) session = requests.Session() @@ -306,26 +298,41 @@ def _process_response( raise APIError(res.status_code, res.text, retry_after=retry_after) +def _feature_flags_retry_delay(failed_attempt: int) -> float: + return FEATURE_FLAGS_RETRY_BACKOFF_SECONDS * (2**failed_attempt) + + def flags( api_key: str, host: Optional[str] = None, gzip: bool = False, timeout: int = 15, + max_retries: int = DEFAULT_FEATURE_FLAGS_REQUEST_MAX_RETRIES, **kwargs, ) -> Any: - """Post the kwargs to the flags API endpoint with automatic retries.""" - res = post( - api_key, - host, - "/flags/?v=2", - gzip, - timeout, - session=_get_flags_session(), - **kwargs, - ) - return _process_response( - res, success_message="Feature flags evaluated successfully" - ) + """Post the kwargs to the flags API endpoint with transport retries.""" + retries = max(0, max_retries) + failed_attempt = 0 + + while True: + try: + res = post( + api_key, + host, + "/flags/?v=2", + gzip, + timeout, + session=_get_flags_session(), + **kwargs, + ) + return _process_response( + res, success_message="Feature flags evaluated successfully" + ) + except (requests.exceptions.ConnectionError, requests.exceptions.Timeout): + if failed_attempt >= retries: + raise + time.sleep(_feature_flags_retry_delay(failed_attempt)) + failed_attempt += 1 def remote_config( diff --git a/posthog/test/test_client.py b/posthog/test/test_client.py index 10a173fa..bbbf5a15 100644 --- a/posthog/test/test_client.py +++ b/posthog/test/test_client.py @@ -984,6 +984,21 @@ def test_basic_capture_with_feature_flags_returns_active_only(self, patch_flags) device_id=None, ) + @mock.patch("posthog.client.flags") + def test_feature_flags_request_max_retries_is_forwarded_when_configured( + self, patch_flags + ): + patch_flags.return_value = {"featureFlags": {}, "featureFlagPayloads": {}} + client = Client( + FAKE_TEST_API_KEY, + feature_flags_request_max_retries=0, + personal_api_key=FAKE_TEST_API_KEY, + ) + + client.get_all_flags("distinct_id") + + self.assertEqual(patch_flags.call_args.kwargs["max_retries"], 0) + @mock.patch("posthog.client.flags") def test_basic_capture_with_feature_flags_and_disable_geoip_returns_correctly( self, patch_flags diff --git a/posthog/test/test_request.py b/posthog/test/test_request.py index 4cae68f9..cd20485c 100644 --- a/posthog/test/test_request.py +++ b/posthog/test/test_request.py @@ -504,17 +504,18 @@ def test_set_socket_options_is_idempotent(): class TestFlagsSession(unittest.TestCase): """Tests for flags session configuration.""" - def test_retry_status_forcelist_excludes_rate_limits(self): - """Verify 429 (rate limit) is NOT retried - need to wait, not hammer.""" - from posthog.request import RETRY_STATUS_FORCELIST - - self.assertNotIn(429, RETRY_STATUS_FORCELIST) + def test_flags_session_disables_adapter_retries(self): + """HTTP adapter retries are disabled; flags() retries transport errors.""" + from posthog.request import _build_flags_session - def test_retry_status_forcelist_excludes_quota_errors(self): - """Verify 402 (payment required/quota) is NOT retried - won't resolve.""" - from posthog.request import RETRY_STATUS_FORCELIST + session = _build_flags_session() + adapter = session.get_adapter("https://test.posthog.com") + retry = adapter.max_retries - self.assertNotIn(402, RETRY_STATUS_FORCELIST) + self.assertEqual(retry.total, 0) + self.assertEqual(retry.connect, 0) + self.assertEqual(retry.read, 0) + self.assertEqual(retry.status, 0) @mock.patch("posthog.request._get_flags_session") def test_flags_uses_flags_session(self, mock_get_flags_session): @@ -564,208 +565,105 @@ def test_flags_no_retry_on_quota_limit(self, mock_get_flags_session): self.assertEqual(mock_session.post.call_count, 1) -class TestFlagsSessionNetworkRetries(unittest.TestCase): - """Tests for network failure retries in the flags session.""" - - def test_flags_session_retry_config_includes_connection_errors(self): - """ - Verify that the flags session is configured to retry on connection errors. - - The urllib3 Retry adapter with connect=2 and read=2 automatically - retries on network-level failures (DNS failures, connection refused, - connection reset, etc.) up to 2 times each. - """ - from posthog.request import _build_flags_session - - session = _build_flags_session() +class TestFlagsRetries(unittest.TestCase): + """Tests for /flags retry behavior.""" - # Get the adapter for https:// - adapter = session.get_adapter("https://test.posthog.com") + @mock.patch("posthog.request.time.sleep") + @mock.patch("posthog.request._get_flags_session") + def test_flags_retries_transport_errors_once_by_default( + self, mock_get_flags_session, mock_sleep + ): + mock_response = requests.Response() + mock_response.status_code = 200 + mock_response._content = json.dumps( + { + "featureFlags": {"test-flag": True}, + "featureFlagPayloads": {}, + "errorsWhileComputingFlags": False, + } + ).encode("utf-8") - # Verify retry configuration - retry = adapter.max_retries - self.assertEqual(retry.total, 2, "Should have 2 total retries") - self.assertEqual(retry.connect, 2, "Should retry connection errors twice") - self.assertEqual(retry.read, 2, "Should retry read errors twice") - self.assertIn("POST", retry.allowed_methods, "Should allow POST retries") + mock_session = mock.MagicMock() + mock_session.post.side_effect = [ + requests.exceptions.ConnectionError("connection failed"), + mock_response, + ] + mock_get_flags_session.return_value = mock_session - def test_flags_session_retries_on_server_errors(self): - """ - Verify that transient server errors (5xx) trigger retries. + response = flags("test-key", "https://test.posthog.com", distinct_id="user123") - This tests the status_forcelist configuration which specifies - which HTTP status codes should trigger a retry. - """ - from posthog.request import _build_flags_session, RETRY_STATUS_FORCELIST + self.assertEqual(response["featureFlags"], {"test-flag": True}) + self.assertEqual(mock_session.post.call_count, 2) + mock_sleep.assert_called_once_with(0.3) - session = _build_flags_session() - adapter = session.get_adapter("https://test.posthog.com") - retry = adapter.max_retries + @mock.patch("posthog.request.time.sleep") + @mock.patch("posthog.request._get_flags_session") + def test_flags_retry_count_zero_disables_retries( + self, mock_get_flags_session, mock_sleep + ): + mock_session = mock.MagicMock() + mock_session.post.side_effect = requests.exceptions.Timeout("timed out") + mock_get_flags_session.return_value = mock_session - # Verify the status codes that trigger retries - self.assertEqual( - set(retry.status_forcelist), - set(RETRY_STATUS_FORCELIST), - "Should retry on transient server errors", - ) + with self.assertRaises(requests.exceptions.Timeout): + flags( + "test-key", + "https://test.posthog.com", + max_retries=0, + distinct_id="user123", + ) - # Verify specific codes are included - self.assertIn(500, retry.status_forcelist) - self.assertIn(502, retry.status_forcelist) - self.assertIn(503, retry.status_forcelist) - self.assertIn(504, retry.status_forcelist) + self.assertEqual(mock_session.post.call_count, 1) + mock_sleep.assert_not_called() - # Verify rate limits and quota errors are NOT retried - self.assertNotIn(429, retry.status_forcelist) - self.assertNotIn(402, retry.status_forcelist) + @mock.patch("posthog.request.time.sleep") + @mock.patch("posthog.request._get_flags_session") + def test_flags_retry_delay_starts_at_300ms_and_doubles( + self, mock_get_flags_session, mock_sleep + ): + mock_response = requests.Response() + mock_response.status_code = 200 + mock_response._content = json.dumps( + { + "featureFlags": {"test-flag": True}, + "featureFlagPayloads": {}, + "errorsWhileComputingFlags": False, + } + ).encode("utf-8") - def test_flags_session_has_backoff(self): - """ - Verify that retries use exponential backoff to avoid thundering herd. - """ - from posthog.request import _build_flags_session + mock_session = mock.MagicMock() + mock_session.post.side_effect = [ + requests.exceptions.ConnectionError("connection failed"), + requests.exceptions.Timeout("timed out"), + mock_response, + ] + mock_get_flags_session.return_value = mock_session - session = _build_flags_session() - adapter = session.get_adapter("https://test.posthog.com") - retry = adapter.max_retries + response = flags( + "test-key", + "https://test.posthog.com", + max_retries=2, + distinct_id="user123", + ) + self.assertEqual(response["featureFlags"], {"test-flag": True}) + self.assertEqual(mock_session.post.call_count, 3) self.assertEqual( - retry.backoff_factor, - 0.5, - "Should use 0.5s backoff factor (0.5s, 1s delays)", + [call.args[0] for call in mock_sleep.call_args_list], [0.3, 0.6] ) + @mock.patch("posthog.request._get_flags_session") + def test_flags_does_not_retry_http_status_errors(self, mock_get_flags_session): + mock_response = requests.Response() + mock_response.status_code = 503 + mock_response._content = b'{"detail": "Service unavailable"}' -class TestFlagsSessionRetryIntegration(unittest.TestCase): - """Integration tests that verify actual retry behavior with a local server.""" - - def test_retries_on_503_then_succeeds(self): - """ - Verify that 503 errors trigger retries and eventually succeed. - - Uses a local HTTP server that fails twice with 503, then succeeds. - This tests the full retry flow including backoff timing. - """ - import threading - from http.server import HTTPServer, BaseHTTPRequestHandler - from socketserver import ThreadingMixIn - from urllib3.util.retry import Retry - from posthog.request import HTTPAdapterWithSocketOptions, RETRY_STATUS_FORCELIST - - request_count = 0 - - class RetryTestHandler(BaseHTTPRequestHandler): - protocol_version = "HTTP/1.1" - - def do_POST(self): - nonlocal request_count - request_count += 1 - - # Read and discard request body to prevent connection issues - content_length = int(self.headers.get("Content-Length", 0)) - if content_length > 0: - self.rfile.read(content_length) - - if request_count <= 2: - self.send_response(503) - self.send_header("Content-Type", "application/json") - body = b'{"error": "Service unavailable"}' - self.send_header("Content-Length", str(len(body))) - self.end_headers() - self.wfile.write(body) - else: - self.send_response(200) - self.send_header("Content-Type", "application/json") - body = ( - b'{"featureFlags": {"test": true}, "featureFlagPayloads": {}}' - ) - self.send_header("Content-Length", str(len(body))) - self.end_headers() - self.wfile.write(body) - - def log_message(self, format, *args): - pass # Suppress logging - - # Use ThreadingMixIn for cleaner shutdown - class ThreadedHTTPServer(ThreadingMixIn, HTTPServer): - daemon_threads = True - - # Start server on a random available port - server = ThreadedHTTPServer(("127.0.0.1", 0), RetryTestHandler) - port = server.server_address[1] - server_thread = threading.Thread(target=server.serve_forever) - server_thread.daemon = True - server_thread.start() - - try: - # Build session with same retry config as _build_flags_session - # but mounted on http:// for local testing - adapter = HTTPAdapterWithSocketOptions( - max_retries=Retry( - total=2, - connect=2, - read=2, - backoff_factor=0.01, # Fast backoff for testing - status_forcelist=RETRY_STATUS_FORCELIST, - allowed_methods=["POST"], - ), - ) - session = requests.Session() - session.mount("http://", adapter) - - response = session.post( - f"http://127.0.0.1:{port}/flags/?v=2", - json={"distinct_id": "user123"}, - timeout=5, - ) + mock_session = mock.MagicMock() + mock_session.post.return_value = mock_response + mock_get_flags_session.return_value = mock_session - # Should succeed on 3rd attempt - self.assertEqual(response.status_code, 200) - self.assertEqual(request_count, 3) # 1 initial + 2 retries - finally: - server.shutdown() - server.server_close() - - def test_connection_errors_are_retried(self): - """ - Verify that connection errors (no server) trigger retries. - - Binds a socket to get a guaranteed available port, then closes it - so connection attempts fail with ConnectionError. - """ - import socket - import time - from urllib3.util.retry import Retry - from posthog.request import HTTPAdapterWithSocketOptions, RETRY_STATUS_FORCELIST - - # Get an available port by binding then closing a socket - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - sock.bind(("127.0.0.1", 0)) - port = sock.getsockname()[1] - sock.close() # Port is now available but nothing is listening - - adapter = HTTPAdapterWithSocketOptions( - max_retries=Retry( - total=2, - connect=2, - read=2, - backoff_factor=0.05, # Very fast for testing - status_forcelist=RETRY_STATUS_FORCELIST, - allowed_methods=["POST"], - ), - ) - session = requests.Session() - session.mount("http://", adapter) - - start = time.time() - with self.assertRaises(requests.exceptions.ConnectionError): - session.post( - f"http://127.0.0.1:{port}/flags/?v=2", - json={"distinct_id": "user123"}, - timeout=1, - ) - elapsed = time.time() - start + with self.assertRaises(APIError) as cm: + flags("test-key", "https://test.posthog.com", distinct_id="user123") - # With 3 attempts and backoff, should take more than instant - # but less than timeout (confirms retries happened) - self.assertGreater(elapsed, 0.05, "Should have some delay from retries") + self.assertEqual(cm.exception.status, 503) + self.assertEqual(mock_session.post.call_count, 1)