From f3f8a42729e5c117361c5413d24776964924eab4 Mon Sep 17 00:00:00 2001 From: Marcelo Trylesinski Date: Thu, 4 Jun 2026 18:46:12 +0200 Subject: [PATCH] Order multipart parser states by frequency in the dispatch ladder The PART_DATA state machine walked its `if/elif state == ...` ladder in lifecycle order, so the hottest states (PART_DATA, the header states) sat near the bottom and paid a failed comparison for every state above them - ~84 state comparisons per part for a 100-field form. Reorder the branches by how often each state is actually active: boundary and part-data first, the once-per-stream START/END states last. The branches are mutually exclusive on `state`, so this is a pure reorder with no behavior change - verified byte-identical to the prior parser across ~135k differential comparisons (every chunk-split strategy incl. byte-by-byte and boundary-edge sweeps, crossed with callback subsets). large ~9% faster (400 to 363 us), simple ~7%, worstcase_crlf ~10%; file upload unchanged. --- python_multipart/multipart.py | 288 +++++++++++++++++----------------- 1 file changed, 145 insertions(+), 143 deletions(-) diff --git a/python_multipart/multipart.py b/python_multipart/multipart.py index 887cf60..2ec3d16 100644 --- a/python_multipart/multipart.py +++ b/python_multipart/multipart.py @@ -1146,29 +1146,13 @@ def data_callback(name: CallbackName, end_i: int, remaining: bool = False) -> No else: self.marks.pop(name, None) - # For each byte... + # For each byte... Branches are ordered by how often each state is the + # active one (boundary and part-data states first), so the hot path walks + # fewer comparisons per iteration rather than reading in lifecycle order. while i < length: c = data[i] - if state == MultipartState.START: - # Skip leading newlines - if c == CR or c == LF: - i = data.find(b"-", i) - if i == -1: - # No boundary candidate in this chunk, so ignore the content after the leading CR/LF. - i = length - break - continue - - # index is used as in index into our boundary. Set to 0. - index = 0 - - # Move to the next state, but decrement i so that we re-process - # this character. - state = MultipartState.START_BOUNDARY - i -= 1 - - elif state == MultipartState.START_BOUNDARY: + if state == MultipartState.START_BOUNDARY: # Check to ensure that the last 2 characters in our boundary # are CRLF. if index == boundary_length - 2: @@ -1210,6 +1194,129 @@ def data_callback(name: CallbackName, end_i: int, remaining: bool = False) -> No # Increment index into boundary and continue. index += 1 + elif state == MultipartState.PART_DATA: + # We're processing our part data right now. During this, we + # need to efficiently search for our boundary, since any data + # on any number of lines can be a part of the current data. + + # Save the current value of our index. We use this in case we + # find part of a boundary, but it doesn't match fully. + prev_index = index + + # If our index is 0, we're starting a new part, so start our + # search. + if index == 0: + # The most common case is likely to be that the whole + # boundary is present in the buffer. + # Calling `find` is much faster than iterating here. + i0 = data.find(boundary, i, length) + if i0 >= 0: + # We matched the whole boundary string. + index = boundary_length - 1 + i = i0 + boundary_length - 1 + c = data[i] + else: + # No whole boundary, but the tail may hold a partial one + # that completes in the next chunk. Boundary starts with + # CR, which an RFC boundary contains nowhere else, so the + # last CR in the tail is the only candidate prefix start. + k = data.rfind(boundary[:1], max(i, length - boundary_length + 1), length) + if k != -1 and boundary.startswith(data[k:length]): + index = length - k + # Carry the partial via index; the end-of-chunk flush + # emits the data before it and re-marks the lookbehind. + i = length + continue + + # Now, we have a couple of cases here. If our index is before + # the end of the boundary... + if index < boundary_length: + # If the character matches... + if boundary[index] == c: + # The current character matches, so continue! + index += 1 + else: + index = 0 + + # Our index is equal to the length of our boundary! + elif index == boundary_length: + # First we increment it. + index += 1 + + # Now, if we've reached a newline, we need to set this as + # the potential end of our boundary. + if c == CR: + flags |= FLAG_PART_BOUNDARY + + # Otherwise, if this is a hyphen, we might be at the last + # of all boundaries. + elif c == HYPHEN: + flags |= FLAG_LAST_BOUNDARY + + # Otherwise, we reset our index, since this isn't either a + # newline or a hyphen. + else: + index = 0 + + # Our index is right after the part boundary, which should be + # a LF. + elif index == boundary_length + 1: + # If we're at a part boundary (i.e. we've seen a CR + # character already)... + if flags & FLAG_PART_BOUNDARY: + # We need a LF character next. + if c == LF: + # Unset the part boundary flag. + flags &= ~FLAG_PART_BOUNDARY + + # We have identified a boundary, callback for any data before it. + data_callback("part_data", i - index) + # Callback indicating that we've reached the end of + # a part, and are starting a new one. + self.callback("part_end") + self.callback("part_begin") + current_header_count = 0 + current_header_size = 0 + + # Move to parsing new headers. + index = 0 + state = MultipartState.HEADER_FIELD_START + i += 1 + continue + + # We didn't find an LF character, so no match. Reset + # our index and clear our flag. + index = 0 + flags &= ~FLAG_PART_BOUNDARY + + # Otherwise, if we're at the last boundary (i.e. we've + # seen a hyphen already)... + elif flags & FLAG_LAST_BOUNDARY: + # We need a second hyphen here. + if c == HYPHEN: + # We have identified a boundary, callback for any data before it. + data_callback("part_data", i - index) + # Callback to end the current part, and then the + # message. + self.callback("part_end") + self.callback("end") + state = MultipartState.END + else: + # No match, so reset index. + index = 0 + + # Otherwise, our index is 0. If the previous index is not, it + # means we reset something, and we need to take the data we + # thought was part of our boundary and send it along as actual + # data. + if index == 0 and prev_index > 0: + # Overwrite our previous index. + prev_index = 0 + + # Re-consider the current character, since this could be + # the start of the boundary itself. + i -= 1 + elif state == MultipartState.HEADER_FIELD_START: # Mark the start of a header field here, reset the index, and # continue parsing our header field. @@ -1341,129 +1448,6 @@ def data_callback(name: CallbackName, end_i: int, remaining: bool = False) -> No state = MultipartState.PART_DATA i -= 1 - elif state == MultipartState.PART_DATA: - # We're processing our part data right now. During this, we - # need to efficiently search for our boundary, since any data - # on any number of lines can be a part of the current data. - - # Save the current value of our index. We use this in case we - # find part of a boundary, but it doesn't match fully. - prev_index = index - - # If our index is 0, we're starting a new part, so start our - # search. - if index == 0: - # The most common case is likely to be that the whole - # boundary is present in the buffer. - # Calling `find` is much faster than iterating here. - i0 = data.find(boundary, i, length) - if i0 >= 0: - # We matched the whole boundary string. - index = boundary_length - 1 - i = i0 + boundary_length - 1 - c = data[i] - else: - # No whole boundary, but the tail may hold a partial one - # that completes in the next chunk. Boundary starts with - # CR, which an RFC boundary contains nowhere else, so the - # last CR in the tail is the only candidate prefix start. - k = data.rfind(boundary[:1], max(i, length - boundary_length + 1), length) - if k != -1 and boundary.startswith(data[k:length]): - index = length - k - # Carry the partial via index; the end-of-chunk flush - # emits the data before it and re-marks the lookbehind. - i = length - continue - - # Now, we have a couple of cases here. If our index is before - # the end of the boundary... - if index < boundary_length: - # If the character matches... - if boundary[index] == c: - # The current character matches, so continue! - index += 1 - else: - index = 0 - - # Our index is equal to the length of our boundary! - elif index == boundary_length: - # First we increment it. - index += 1 - - # Now, if we've reached a newline, we need to set this as - # the potential end of our boundary. - if c == CR: - flags |= FLAG_PART_BOUNDARY - - # Otherwise, if this is a hyphen, we might be at the last - # of all boundaries. - elif c == HYPHEN: - flags |= FLAG_LAST_BOUNDARY - - # Otherwise, we reset our index, since this isn't either a - # newline or a hyphen. - else: - index = 0 - - # Our index is right after the part boundary, which should be - # a LF. - elif index == boundary_length + 1: - # If we're at a part boundary (i.e. we've seen a CR - # character already)... - if flags & FLAG_PART_BOUNDARY: - # We need a LF character next. - if c == LF: - # Unset the part boundary flag. - flags &= ~FLAG_PART_BOUNDARY - - # We have identified a boundary, callback for any data before it. - data_callback("part_data", i - index) - # Callback indicating that we've reached the end of - # a part, and are starting a new one. - self.callback("part_end") - self.callback("part_begin") - current_header_count = 0 - current_header_size = 0 - - # Move to parsing new headers. - index = 0 - state = MultipartState.HEADER_FIELD_START - i += 1 - continue - - # We didn't find an LF character, so no match. Reset - # our index and clear our flag. - index = 0 - flags &= ~FLAG_PART_BOUNDARY - - # Otherwise, if we're at the last boundary (i.e. we've - # seen a hyphen already)... - elif flags & FLAG_LAST_BOUNDARY: - # We need a second hyphen here. - if c == HYPHEN: - # We have identified a boundary, callback for any data before it. - data_callback("part_data", i - index) - # Callback to end the current part, and then the - # message. - self.callback("part_end") - self.callback("end") - state = MultipartState.END - else: - # No match, so reset index. - index = 0 - - # Otherwise, our index is 0. If the previous index is not, it - # means we reset something, and we need to take the data we - # thought was part of our boundary and send it along as actual - # data. - if index == 0 and prev_index > 0: - # Overwrite our previous index. - prev_index = 0 - - # Re-consider the current character, since this could be - # the start of the boundary itself. - i -= 1 - elif state == MultipartState.END_BOUNDARY: if index == boundary_length - 1: if c != HYPHEN: @@ -1474,6 +1458,24 @@ def data_callback(name: CallbackName, end_i: int, remaining: bool = False) -> No self.callback("end") state = MultipartState.END + elif state == MultipartState.START: + # Skip leading newlines + if c == CR or c == LF: + i = data.find(b"-", i) + if i == -1: + # No boundary candidate in this chunk, so ignore the content after the leading CR/LF. + i = length + break + continue + + # index is used as in index into our boundary. Set to 0. + index = 0 + + # Move to the next state, but decrement i so that we re-process + # this character. + state = MultipartState.START_BOUNDARY + i -= 1 + elif state == MultipartState.END: # Silently discard any epilogue data (RFC 2046 section 5.1.1 allows a CRLF and optional # epilogue after the closing boundary). Django and Werkzeug do the same.