Commit 0a398936 authored by Dean Jukes

venv

Showing with 1077 additions and 0 deletions
File added
File added
from __future__ import absolute_import
from kafka.admin.config_resource import ConfigResource, ConfigResourceType
from kafka.admin.client import KafkaAdminClient
from kafka.admin.acl_resource import (ACL, ACLFilter, ResourcePattern, ResourcePatternFilter, ACLOperation,
ResourceType, ACLPermissionType, ACLResourcePatternType)
from kafka.admin.new_topic import NewTopic
from kafka.admin.new_partitions import NewPartitions
__all__ = [
'ConfigResource', 'ConfigResourceType', 'KafkaAdminClient', 'NewTopic', 'NewPartitions', 'ACL', 'ACLFilter',
'ResourcePattern', 'ResourcePatternFilter', 'ACLOperation', 'ResourceType', 'ACLPermissionType',
'ACLResourcePatternType'
]
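As a hedged illustration of the admin surface exported above (not part of this commit), creating a topic through KafkaAdminClient might look like the sketch below; the broker address and topic name are placeholders.

from kafka.admin import KafkaAdminClient, NewTopic

admin = KafkaAdminClient(bootstrap_servers='localhost:9092')  # placeholder broker address
admin.create_topics([NewTopic(name='example-topic', num_partitions=3, replication_factor=1)])
admin.close()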
File added
File added
File added
from __future__ import absolute_import
from kafka.errors import IllegalArgumentError
# enum in stdlib as of py3.4
try:
from enum import IntEnum # pylint: disable=import-error
except ImportError:
# vendored backport module
from kafka.vendor.enum34 import IntEnum
class ResourceType(IntEnum):
"""Type of kafka resource to set ACL for
The ANY value is only valid in a filter context
"""
UNKNOWN = 0,
ANY = 1,
CLUSTER = 4,
DELEGATION_TOKEN = 6,
GROUP = 3,
TOPIC = 2,
TRANSACTIONAL_ID = 5
class ACLOperation(IntEnum):
"""Type of operation
The ANY value is only valid in a filter context
"""
ANY = 1,
ALL = 2,
READ = 3,
WRITE = 4,
CREATE = 5,
DELETE = 6,
ALTER = 7,
DESCRIBE = 8,
CLUSTER_ACTION = 9,
DESCRIBE_CONFIGS = 10,
ALTER_CONFIGS = 11,
IDEMPOTENT_WRITE = 12
class ACLPermissionType(IntEnum):
"""An enumerated type of permissions
The ANY value is only valid in a filter context
"""
ANY = 1,
DENY = 2,
ALLOW = 3
class ACLResourcePatternType(IntEnum):
"""An enumerated type of resource patterns
More details on the pattern types and how they work
can be found in KIP-290 (Support for prefixed ACLs)
https://cwiki.apache.org/confluence/display/KAFKA/KIP-290%3A+Support+for+Prefixed+ACLs
"""
ANY = 1,
MATCH = 2,
LITERAL = 3,
PREFIXED = 4
class ACLFilter(object):
"""Represents a filter to use with describing and deleting ACLs
The difference between this class and the ACL class is mainly that
we allow using ANY with the operation, permission, and resource type objects
to fetch ACLs matching any of the properties.
To make a filter matching any principal, set principal to None
"""
def __init__(
self,
principal,
host,
operation,
permission_type,
resource_pattern
):
self.principal = principal
self.host = host
self.operation = operation
self.permission_type = permission_type
self.resource_pattern = resource_pattern
self.validate()
def validate(self):
if not isinstance(self.operation, ACLOperation):
raise IllegalArgumentError("operation must be an ACLOperation object, and cannot be ANY")
if not isinstance(self.permission_type, ACLPermissionType):
raise IllegalArgumentError("permission_type must be an ACLPermissionType object, and cannot be ANY")
if not isinstance(self.resource_pattern, ResourcePatternFilter):
raise IllegalArgumentError("resource_pattern must be a ResourcePatternFilter object")
def __repr__(self):
return "<ACL principal={principal}, resource={resource}, operation={operation}, type={type}, host={host}>".format(
principal=self.principal,
host=self.host,
operation=self.operation.name,
type=self.permission_type.name,
resource=self.resource_pattern
)
def __eq__(self, other):
return all((
self.principal == other.principal,
self.host == other.host,
self.operation == other.operation,
self.permission_type == other.permission_type,
self.resource_pattern == other.resource_pattern
))
def __hash__(self):
return hash((
self.principal,
self.host,
self.operation,
self.permission_type,
self.resource_pattern,
))
class ACL(ACLFilter):
"""Represents a concrete ACL for a specific ResourcePattern
In kafka an ACL is a 4-tuple of (principal, host, operation, permission_type)
that limits who can do what on a specific resource (or since KIP-290 a resource pattern)
Terminology:
Principal -> This is the identifier for the user. Depending on the authorization method used (SSL, SASL etc)
the principal will look different. See http://kafka.apache.org/documentation/#security_authz for details.
The principal must be in the format "User:<name>" or kafka will treat it as invalid. It's possible to use
other principal types than "User" if using a custom authorizer for the cluster.
Host -> This must currently be an IP address. It cannot be a range, and it cannot be a domain name.
It can be set to "*", which is special cased in kafka to mean "any host"
Operation -> Which client operation this ACL refers to. Has different meaning depending
on the resource type the ACL refers to. See https://docs.confluent.io/current/kafka/authorization.html#acl-format
for a list of which resource/operation combinations unlock which kafka APIs
Permission Type -> Whether this ACL is allowing or denying access
Resource Pattern -> This is a representation of the resource or resource pattern that the ACL
refers to. See the ResourcePattern class for details.
"""
def __init__(
self,
principal,
host,
operation,
permission_type,
resource_pattern
):
super(ACL, self).__init__(principal, host, operation, permission_type, resource_pattern)
self.validate()
def validate(self):
if self.operation == ACLOperation.ANY:
raise IllegalArgumentError("operation cannot be ANY")
if self.permission_type == ACLPermissionType.ANY:
raise IllegalArgumentError("permission_type cannot be ANY")
if not isinstance(self.resource_pattern, ResourcePattern):
raise IllegalArgumentError("resource_pattern must be a ResourcePattern object")
class ResourcePatternFilter(object):
def __init__(
self,
resource_type,
resource_name,
pattern_type
):
self.resource_type = resource_type
self.resource_name = resource_name
self.pattern_type = pattern_type
self.validate()
def validate(self):
if not isinstance(self.resource_type, ResourceType):
raise IllegalArgumentError("resource_type must be a ResourceType object")
if not isinstance(self.pattern_type, ACLResourcePatternType):
raise IllegalArgumentError("pattern_type must be an ACLResourcePatternType object")
def __repr__(self):
return "<ResourcePattern type={}, name={}, pattern={}>".format(
self.resource_type.name,
self.resource_name,
self.pattern_type.name
)
def __eq__(self, other):
return all((
self.resource_type == other.resource_type,
self.resource_name == other.resource_name,
self.pattern_type == other.pattern_type,
))
def __hash__(self):
return hash((
self.resource_type,
self.resource_name,
self.pattern_type
))
class ResourcePattern(ResourcePatternFilter):
"""A resource pattern to apply the ACL to
Resource patterns provide a more flexible way to specify which resources an ACL
describes than, for example, pointing at a literal topic name.
Since KIP-290 (kafka 2.0) it's possible to set an ACL for a prefixed resource name, which
can cut down considerably on the number of ACLs needed when the number of topics and
consumer groups start to grow.
The default pattern_type is LITERAL, and it describes a specific resource. This is also how
ACLs worked before the introduction of prefixed ACLs.
"""
def __init__(
self,
resource_type,
resource_name,
pattern_type=ACLResourcePatternType.LITERAL
):
super(ResourcePattern, self).__init__(resource_type, resource_name, pattern_type)
self.validate()
def validate(self):
if self.resource_type == ResourceType.ANY:
raise IllegalArgumentError("resource_type cannot be ANY")
if self.pattern_type in [ACLResourcePatternType.ANY, ACLResourcePatternType.MATCH]:
raise IllegalArgumentError(
"pattern_type cannot be {} on a concrete ResourcePattern".format(self.pattern_type.name)
)
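To illustrate how the classes above fit together (this example is not part of the vendored file), a concrete ACL allowing reads on every topic whose name starts with a given prefix, plus a broad filter for describe/delete calls, could be built like this; the principal and prefix are placeholders.

example_acl = ACL(
    principal='User:alice',   # placeholder principal
    host='*',                 # special-cased by kafka to mean "any host"
    operation=ACLOperation.READ,
    permission_type=ACLPermissionType.ALLOW,
    resource_pattern=ResourcePattern(ResourceType.TOPIC, 'orders-', ACLResourcePatternType.PREFIXED)
)

# A filter matching ACLs for any principal, operation and permission on any topic
example_filter = ACLFilter(
    principal=None,
    host='*',
    operation=ACLOperation.ANY,
    permission_type=ACLPermissionType.ANY,
    resource_pattern=ResourcePatternFilter(ResourceType.TOPIC, None, ACLResourcePatternType.ANY)
)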
This diff is collapsed.
from __future__ import absolute_import
# enum in stdlib as of py3.4
try:
from enum import IntEnum # pylint: disable=import-error
except ImportError:
# vendored backport module
from kafka.vendor.enum34 import IntEnum
class ConfigResourceType(IntEnum):
"""An enumerated type of config resources"""
BROKER = 4,
TOPIC = 2
class ConfigResource(object):
"""A class for specifying config resources.
Arguments:
resource_type (ConfigResourceType): the type of kafka resource
name (string): The name of the kafka resource
configs ({key : value}): A map of config keys to values.
"""
def __init__(
self,
resource_type,
name,
configs=None
):
if not isinstance(resource_type, (ConfigResourceType)):
resource_type = ConfigResourceType[str(resource_type).upper()] # pylint: disable-msg=unsubscriptable-object
self.resource_type = resource_type
self.name = name
self.configs = configs
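A small illustrative use of ConfigResource (not part of this commit); note that the constructor also accepts the resource type as a string, which it upper-cases and looks up in ConfigResourceType. The topic name and config key are placeholders.

topic_configs = ConfigResource(ConfigResourceType.TOPIC, 'example-topic',
                               configs={'retention.ms': '604800000'})
# Equivalent, relying on the string-to-enum coercion in __init__:
topic_configs_from_str = ConfigResource('topic', 'example-topic')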
from __future__ import absolute_import
class NewPartitions(object):
"""A class for new partition creation on existing topics. Note that the length of new_assignments, if specified,
must be the difference between the new total number of partitions and the existing number of partitions.
Arguments:
total_count (int): the total number of partitions that should exist on the topic
new_assignments ([[int]]): an array of arrays of replica assignments for new partitions.
If not set, broker assigns replicas per an internal algorithm.
"""
def __init__(
self,
total_count,
new_assignments=None
):
self.total_count = total_count
self.new_assignments = new_assignments
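For example (illustrative values, not part of this commit): growing a topic that currently has three partitions to five means total_count=5 and, if replica assignments are given, exactly two inner lists, one per newly created partition.

grow_to_five = NewPartitions(total_count=5, new_assignments=[[0, 1], [1, 2]])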
from __future__ import absolute_import
from kafka.errors import IllegalArgumentError
class NewTopic(object):
""" A class for new topic creation
Arguments:
name (string): name of the topic
num_partitions (int): number of partitions
or -1 if replica_assignments has been specified
replication_factor (int): replication factor or -1 if
replica assignment is specified
replica_assignments (dict of int: [int]): A mapping of
partition id to the replicas to assign to it.
topic_configs (dict of str: str): A mapping of config key
and value for the topic.
"""
def __init__(
self,
name,
num_partitions,
replication_factor,
replica_assignments=None,
topic_configs=None,
):
if not (num_partitions == -1 or replication_factor == -1) ^ (replica_assignments is None):
raise IllegalArgumentError('either num_partitions/replication_factor or replica_assignment must be specified')
self.name = name
self.num_partitions = num_partitions
self.replication_factor = replication_factor
self.replica_assignments = replica_assignments or {}
self.topic_configs = topic_configs or {}
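Two illustrative ways to build a NewTopic that satisfy the XOR check above (topic names and broker ids are placeholders): either give partition and replication counts, or set both to -1 and supply an explicit replica assignment.

counted_topic = NewTopic(name='example-topic', num_partitions=6, replication_factor=3)
assigned_topic = NewTopic(name='example-assigned', num_partitions=-1, replication_factor=-1,
                          replica_assignments={0: [0, 1], 1: [1, 2]})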
This diff is collapsed.
from __future__ import absolute_import
import collections
import copy
import logging
import threading
import time
from kafka.vendor import six
from kafka import errors as Errors
from kafka.conn import collect_hosts
from kafka.future import Future
from kafka.structs import BrokerMetadata, PartitionMetadata, TopicPartition
log = logging.getLogger(__name__)
class ClusterMetadata(object):
"""
A class to manage kafka cluster metadata.
This class does not perform any IO. It simply updates internal state
given API responses (MetadataResponse, GroupCoordinatorResponse).
Keyword Arguments:
retry_backoff_ms (int): Milliseconds to backoff when retrying on
errors. Default: 100.
metadata_max_age_ms (int): The period of time in milliseconds after
which we force a refresh of metadata even if we haven't seen any
partition leadership changes to proactively discover any new
brokers or partitions. Default: 300000
bootstrap_servers: 'host[:port]' string (or list of 'host[:port]'
strings) that the client should contact to bootstrap initial
cluster metadata. This does not have to be the full node list.
It just needs to have at least one broker that will respond to a
Metadata API Request. Default port is 9092. If no servers are
specified, will default to localhost:9092.
"""
DEFAULT_CONFIG = {
'retry_backoff_ms': 100,
'metadata_max_age_ms': 300000,
'bootstrap_servers': [],
}
def __init__(self, **configs):
self._brokers = {} # node_id -> BrokerMetadata
self._partitions = {} # topic -> partition -> PartitionMetadata
self._broker_partitions = collections.defaultdict(set) # node_id -> {TopicPartition...}
self._groups = {} # group_name -> node_id
self._last_refresh_ms = 0
self._last_successful_refresh_ms = 0
self._need_update = True
self._future = None
self._listeners = set()
self._lock = threading.Lock()
self.need_all_topic_metadata = False
self.unauthorized_topics = set()
self.internal_topics = set()
self.controller = None
self.config = copy.copy(self.DEFAULT_CONFIG)
for key in self.config:
if key in configs:
self.config[key] = configs[key]
self._bootstrap_brokers = self._generate_bootstrap_brokers()
self._coordinator_brokers = {}
def _generate_bootstrap_brokers(self):
# collect_hosts does not perform DNS, so we should be fine to re-use
bootstrap_hosts = collect_hosts(self.config['bootstrap_servers'])
brokers = {}
for i, (host, port, _) in enumerate(bootstrap_hosts):
node_id = 'bootstrap-%s' % i
brokers[node_id] = BrokerMetadata(node_id, host, port, None)
return brokers
def is_bootstrap(self, node_id):
return node_id in self._bootstrap_brokers
def brokers(self):
"""Get all BrokerMetadata
Returns:
set: {BrokerMetadata, ...}
"""
return set(self._brokers.values()) or set(self._bootstrap_brokers.values())
def broker_metadata(self, broker_id):
"""Get BrokerMetadata
Arguments:
broker_id (int): node_id for a broker to check
Returns:
BrokerMetadata or None if not found
"""
return (
self._brokers.get(broker_id) or
self._bootstrap_brokers.get(broker_id) or
self._coordinator_brokers.get(broker_id)
)
def partitions_for_topic(self, topic):
"""Return set of all partitions for topic (whether available or not)
Arguments:
topic (str): topic to check for partitions
Returns:
set: {partition (int), ...}
"""
if topic not in self._partitions:
return None
return set(self._partitions[topic].keys())
def available_partitions_for_topic(self, topic):
"""Return set of partitions with known leaders
Arguments:
topic (str): topic to check for partitions
Returns:
set: {partition (int), ...}
None if topic not found.
"""
if topic not in self._partitions:
return None
return set([partition for partition, metadata
in six.iteritems(self._partitions[topic])
if metadata.leader != -1])
def leader_for_partition(self, partition):
"""Return node_id of leader, -1 unavailable, None if unknown."""
if partition.topic not in self._partitions:
return None
elif partition.partition not in self._partitions[partition.topic]:
return None
return self._partitions[partition.topic][partition.partition].leader
def partitions_for_broker(self, broker_id):
"""Return TopicPartitions for which the broker is a leader.
Arguments:
broker_id (int): node id for a broker
Returns:
set: {TopicPartition, ...}
None if the broker either has no partitions or does not exist.
"""
return self._broker_partitions.get(broker_id)
def coordinator_for_group(self, group):
"""Return node_id of group coordinator.
Arguments:
group (str): name of consumer group
Returns:
int: node_id for group coordinator
None if the group does not exist.
"""
return self._groups.get(group)
def ttl(self):
"""Milliseconds until metadata should be refreshed"""
now = time.time() * 1000
if self._need_update:
ttl = 0
else:
metadata_age = now - self._last_successful_refresh_ms
ttl = self.config['metadata_max_age_ms'] - metadata_age
retry_age = now - self._last_refresh_ms
next_retry = self.config['retry_backoff_ms'] - retry_age
return max(ttl, next_retry, 0)
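    # Worked example with the defaults above (illustrative numbers, not from this
    # commit): if no update is flagged and the last successful refresh was
    # 200,000 ms ago, ttl = 300000 - 200000 = 100000; if the last refresh attempt
    # was 40 ms ago, next_retry = 100 - 40 = 60; max(100000, 60, 0) returns
    # 100000 ms until the next forced refresh.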
def refresh_backoff(self):
"""Return milliseconds to wait before attempting to retry after failure"""
return self.config['retry_backoff_ms']
def request_update(self):
"""Flags metadata for update, return Future()
Actual update must be handled separately. This method will only
change the reported ttl()
Returns:
kafka.future.Future (value will be the cluster object after update)
"""
with self._lock:
self._need_update = True
if not self._future or self._future.is_done:
self._future = Future()
return self._future
def topics(self, exclude_internal_topics=True):
"""Get set of known topics.
Arguments:
exclude_internal_topics (bool): Whether records from internal topics
(such as offsets) should be exposed to the consumer. If set to
True the only way to receive records from an internal topic is
subscribing to it. Default True
Returns:
set: {topic (str), ...}
"""
topics = set(self._partitions.keys())
if exclude_internal_topics:
return topics - self.internal_topics
else:
return topics
def failed_update(self, exception):
"""Update cluster state given a failed MetadataRequest."""
f = None
with self._lock:
if self._future:
f = self._future
self._future = None
if f:
f.failure(exception)
self._last_refresh_ms = time.time() * 1000
def update_metadata(self, metadata):
"""Update cluster state given a MetadataResponse.
Arguments:
metadata (MetadataResponse): broker response to a metadata request
Returns: None
"""
# In the common case where we ask for a single topic and get back an
# error, we should fail the future
if len(metadata.topics) == 1 and metadata.topics[0][0] != 0:
error_code, topic = metadata.topics[0][:2]
error = Errors.for_code(error_code)(topic)
return self.failed_update(error)
if not metadata.brokers:
log.warning("No broker metadata found in MetadataResponse -- ignoring.")
return self.failed_update(Errors.MetadataEmptyBrokerList(metadata))
_new_brokers = {}
for broker in metadata.brokers:
if metadata.API_VERSION == 0:
node_id, host, port = broker
rack = None
else:
node_id, host, port, rack = broker
_new_brokers.update({
node_id: BrokerMetadata(node_id, host, port, rack)
})
if metadata.API_VERSION == 0:
_new_controller = None
else:
_new_controller = _new_brokers.get(metadata.controller_id)
_new_partitions = {}
_new_broker_partitions = collections.defaultdict(set)
_new_unauthorized_topics = set()
_new_internal_topics = set()
for topic_data in metadata.topics:
if metadata.API_VERSION == 0:
error_code, topic, partitions = topic_data
is_internal = False
else:
error_code, topic, is_internal, partitions = topic_data
if is_internal:
_new_internal_topics.add(topic)
error_type = Errors.for_code(error_code)
if error_type is Errors.NoError:
_new_partitions[topic] = {}
for p_error, partition, leader, replicas, isr in partitions:
_new_partitions[topic][partition] = PartitionMetadata(
topic=topic, partition=partition, leader=leader,
replicas=replicas, isr=isr, error=p_error)
if leader != -1:
_new_broker_partitions[leader].add(
TopicPartition(topic, partition))
# Specific topic errors can be ignored if this is a full metadata fetch
elif self.need_all_topic_metadata:
continue
elif error_type is Errors.LeaderNotAvailableError:
log.warning("Topic %s is not available during auto-create"
" initialization", topic)
elif error_type is Errors.UnknownTopicOrPartitionError:
log.error("Topic %s not found in cluster metadata", topic)
elif error_type is Errors.TopicAuthorizationFailedError:
log.error("Topic %s is not authorized for this client", topic)
_new_unauthorized_topics.add(topic)
elif error_type is Errors.InvalidTopicError:
log.error("'%s' is not a valid topic name", topic)
else:
log.error("Error fetching metadata for topic %s: %s",
topic, error_type)
with self._lock:
self._brokers = _new_brokers
self.controller = _new_controller
self._partitions = _new_partitions
self._broker_partitions = _new_broker_partitions
self.unauthorized_topics = _new_unauthorized_topics
self.internal_topics = _new_internal_topics
f = None
if self._future:
f = self._future
self._future = None
self._need_update = False
now = time.time() * 1000
self._last_refresh_ms = now
self._last_successful_refresh_ms = now
if f:
f.success(self)
log.debug("Updated cluster metadata to %s", self)
for listener in self._listeners:
listener(self)
if self.need_all_topic_metadata:
# the listener may change the interested topics,
# which could cause another metadata refresh.
# If we have already fetched all topics, however,
# another fetch should be unnecessary.
self._need_update = False
def add_listener(self, listener):
"""Add a callback function to be called on each metadata update"""
self._listeners.add(listener)
def remove_listener(self, listener):
"""Remove a previously added listener callback"""
self._listeners.remove(listener)
def add_group_coordinator(self, group, response):
"""Update with metadata for a group coordinator
Arguments:
group (str): name of group from GroupCoordinatorRequest
response (GroupCoordinatorResponse): broker response
Returns:
string: coordinator node_id if metadata is updated, None on error
"""
log.debug("Updating coordinator for %s: %s", group, response)
error_type = Errors.for_code(response.error_code)
if error_type is not Errors.NoError:
log.error("GroupCoordinatorResponse error: %s", error_type)
self._groups[group] = -1
return
# Use a coordinator-specific node id so that group requests
# get a dedicated connection
node_id = 'coordinator-{}'.format(response.coordinator_id)
coordinator = BrokerMetadata(
node_id,
response.host,
response.port,
None)
log.info("Group coordinator for %s is %s", group, coordinator)
self._coordinator_brokers[node_id] = coordinator
self._groups[group] = node_id
return node_id
def with_partitions(self, partitions_to_add):
"""Returns a copy of cluster metadata with partitions added"""
new_metadata = ClusterMetadata(**self.config)
new_metadata._brokers = copy.deepcopy(self._brokers)
new_metadata._partitions = copy.deepcopy(self._partitions)
new_metadata._broker_partitions = copy.deepcopy(self._broker_partitions)
new_metadata._groups = copy.deepcopy(self._groups)
new_metadata.internal_topics = copy.deepcopy(self.internal_topics)
new_metadata.unauthorized_topics = copy.deepcopy(self.unauthorized_topics)
for partition in partitions_to_add:
new_metadata._partitions[partition.topic][partition.partition] = partition
if partition.leader is not None and partition.leader != -1:
new_metadata._broker_partitions[partition.leader].add(
TopicPartition(partition.topic, partition.partition))
return new_metadata
def __str__(self):
return 'ClusterMetadata(brokers: %d, topics: %d, groups: %d)' % \
(len(self._brokers), len(self._partitions), len(self._groups))
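A minimal sketch of driving ClusterMetadata by hand (not part of this commit; the broker address is a placeholder). No IO happens here: brokers() falls back to the synthetic bootstrap entries, ttl() is 0 because an update is flagged from the start, and request_update() only returns a Future for the client to resolve.

cluster = ClusterMetadata(bootstrap_servers=['localhost:9092'])
print(cluster.brokers())        # set containing the synthetic 'bootstrap-0' BrokerMetadata
print(cluster.ttl())            # 0 -- a metadata refresh is needed immediately
future = cluster.request_update()   # resolved later by whoever performs the MetadataRequest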
from __future__ import absolute_import
import gzip
import io
import platform
import struct
from kafka.vendor import six
from kafka.vendor.six.moves import range
_XERIAL_V1_HEADER = (-126, b'S', b'N', b'A', b'P', b'P', b'Y', 0, 1, 1)
_XERIAL_V1_FORMAT = 'bccccccBii'
ZSTD_MAX_OUTPUT_SIZE = 1024 * 1024
try:
import snappy
except ImportError:
snappy = None
try:
import zstandard as zstd
except ImportError:
zstd = None
try:
import lz4.frame as lz4
def _lz4_compress(payload, **kwargs):
# Kafka does not support LZ4 dependent blocks
try:
# For lz4>=0.12.0
kwargs.pop('block_linked', None)
return lz4.compress(payload, block_linked=False, **kwargs)
except TypeError:
# For earlier versions of lz4
kwargs.pop('block_mode', None)
return lz4.compress(payload, block_mode=1, **kwargs)
except ImportError:
lz4 = None
try:
import lz4f
except ImportError:
lz4f = None
try:
import lz4framed
except ImportError:
lz4framed = None
try:
import xxhash
except ImportError:
xxhash = None
PYPY = bool(platform.python_implementation() == 'PyPy')
def has_gzip():
return True
def has_snappy():
return snappy is not None
def has_zstd():
return zstd is not None
def has_lz4():
if lz4 is not None:
return True
if lz4f is not None:
return True
if lz4framed is not None:
return True
return False
def gzip_encode(payload, compresslevel=None):
if not compresslevel:
compresslevel = 9
buf = io.BytesIO()
# Gzip context manager introduced in python 2.7
# so old-fashioned way until we decide to not support 2.6
gzipper = gzip.GzipFile(fileobj=buf, mode="w", compresslevel=compresslevel)
try:
gzipper.write(payload)
finally:
gzipper.close()
return buf.getvalue()
def gzip_decode(payload):
buf = io.BytesIO(payload)
# Gzip context manager introduced in python 2.7
# so old-fashioned way until we decide to not support 2.6
gzipper = gzip.GzipFile(fileobj=buf, mode='r')
try:
return gzipper.read()
finally:
gzipper.close()
def snappy_encode(payload, xerial_compatible=True, xerial_blocksize=32*1024):
"""Encodes the given data with snappy compression.
If xerial_compatible is set then the stream is encoded in a fashion
compatible with the xerial snappy library.
The block size (xerial_blocksize) controls how frequently blocking occurs;
32k is the default in the xerial library.
The format winds up being:
+-------------+------------+--------------+------------+--------------+
| Header | Block1 len | Block1 data | Blockn len | Blockn data |
+-------------+------------+--------------+------------+--------------+
| 16 bytes | BE int32 | snappy bytes | BE int32 | snappy bytes |
+-------------+------------+--------------+------------+--------------+
It is important to note that the blocksize is the amount of uncompressed
data presented to snappy at each block, whereas the blocklen is the number
of bytes that will be present in the stream; so the length will always be
<= blocksize.
"""
if not has_snappy():
raise NotImplementedError("Snappy codec is not available")
if not xerial_compatible:
return snappy.compress(payload)
out = io.BytesIO()
for fmt, dat in zip(_XERIAL_V1_FORMAT, _XERIAL_V1_HEADER):
out.write(struct.pack('!' + fmt, dat))
# Chunk through buffers to avoid creating intermediate slice copies
if PYPY:
# on pypy, snappy.compress() on a sliced buffer consumes the entire
# buffer... likely a python-snappy bug, so just use a slice copy
chunker = lambda payload, i, size: payload[i:size+i]
elif six.PY2:
# Sliced buffer avoids additional copies
# pylint: disable-msg=undefined-variable
chunker = lambda payload, i, size: buffer(payload, i, size)
else:
# snappy.compress does not like raw memoryviews, so we have to convert
# tobytes, which is a copy... oh well. it's the thought that counts.
# pylint: disable-msg=undefined-variable
chunker = lambda payload, i, size: memoryview(payload)[i:size+i].tobytes()
for chunk in (chunker(payload, i, xerial_blocksize)
for i in range(0, len(payload), xerial_blocksize)):
block = snappy.compress(chunk)
block_size = len(block)
out.write(struct.pack('!i', block_size))
out.write(block)
return out.getvalue()
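# Illustrative framing arithmetic (not part of this commit): a 100 KiB payload
# with the default 32 KiB xerial_blocksize is fed to snappy in four chunks of
# 32768, 32768, 32768 and 4096 uncompressed bytes, so the resulting stream is
# the 16-byte header followed by four (big-endian int32 length, snappy block) pairs.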
def _detect_xerial_stream(payload):
"""Detects if the data given might have been encoded with the blocking mode
of the xerial snappy library.
This mode writes a magic header of the format:
+--------+--------------+------------+---------+--------+
| Marker | Magic String | Null / Pad | Version | Compat |
+--------+--------------+------------+---------+--------+
| byte | c-string | byte | int32 | int32 |
+--------+--------------+------------+---------+--------+
| -126 | 'SNAPPY' | \0 | | |
+--------+--------------+------------+---------+--------+
The pad appears to be there to ensure that SNAPPY is a valid cstring.
The version is the version of this format as written by xerial;
in the wild this is currently 1, so we only support v1.
Compat is there to claim the minimum supported version that
can read a xerial block stream; presently in the wild this is
1.
"""
if len(payload) > 16:
header = struct.unpack('!' + _XERIAL_V1_FORMAT, bytes(payload)[:16])
return header == _XERIAL_V1_HEADER
return False
def snappy_decode(payload):
if not has_snappy():
raise NotImplementedError("Snappy codec is not available")
if _detect_xerial_stream(payload):
# TODO ? Should become a fileobj ?
out = io.BytesIO()
byt = payload[16:]
length = len(byt)
cursor = 0
while cursor < length:
block_size = struct.unpack_from('!i', byt[cursor:])[0]
# Skip the block size
cursor += 4
end = cursor + block_size
out.write(snappy.decompress(byt[cursor:end]))
cursor = end
out.seek(0)
return out.read()
else:
return snappy.decompress(payload)
if lz4:
lz4_encode = _lz4_compress # pylint: disable-msg=no-member
elif lz4f:
lz4_encode = lz4f.compressFrame # pylint: disable-msg=no-member
elif lz4framed:
lz4_encode = lz4framed.compress # pylint: disable-msg=no-member
else:
lz4_encode = None
def lz4f_decode(payload):
"""Decode payload using interoperable LZ4 framing. Requires Kafka >= 0.10"""
# pylint: disable-msg=no-member
ctx = lz4f.createDecompContext()
data = lz4f.decompressFrame(payload, ctx)
lz4f.freeDecompContext(ctx)
# lz4f python module does not expose how much of the payload was
# actually read if the decompression was only partial.
if data['next'] != 0:
raise RuntimeError('lz4f unable to decompress full payload')
return data['decomp']
if lz4:
lz4_decode = lz4.decompress # pylint: disable-msg=no-member
elif lz4f:
lz4_decode = lz4f_decode
elif lz4framed:
lz4_decode = lz4framed.decompress # pylint: disable-msg=no-member
else:
lz4_decode = None
def lz4_encode_old_kafka(payload):
"""Encode payload for 0.8/0.9 brokers -- requires an incorrect header checksum."""
assert xxhash is not None
data = lz4_encode(payload)
header_size = 7
flg = data[4]
if not isinstance(flg, int):
flg = ord(flg)
content_size_bit = ((flg >> 3) & 1)
if content_size_bit:
# Old kafka does not accept the content-size field
# so we need to discard it and reset the header flag
flg -= 8
data = bytearray(data)
data[4] = flg
data = bytes(data)
payload = data[header_size+8:]
else:
payload = data[header_size:]
# This is the incorrect hc
hc = xxhash.xxh32(data[0:header_size-1]).digest()[-2:-1] # pylint: disable-msg=no-member
return b''.join([
data[0:header_size-1],
hc,
payload
])
def lz4_decode_old_kafka(payload):
assert xxhash is not None
# Kafka's LZ4 code has a bug in its header checksum implementation
header_size = 7
if isinstance(payload[4], int):
flg = payload[4]
else:
flg = ord(payload[4])
content_size_bit = ((flg >> 3) & 1)
if content_size_bit:
header_size += 8
# This should be the correct hc
hc = xxhash.xxh32(payload[4:header_size-1]).digest()[-2:-1] # pylint: disable-msg=no-member
munged_payload = b''.join([
payload[0:header_size-1],
hc,
payload[header_size:]
])
return lz4_decode(munged_payload)
def zstd_encode(payload):
if not zstd:
raise NotImplementedError("Zstd codec is not available")
return zstd.ZstdCompressor().compress(payload)
def zstd_decode(payload):
if not zstd:
raise NotImplementedError("Zstd codec is not available")
try:
return zstd.ZstdDecompressor().decompress(payload)
except zstd.ZstdError:
return zstd.ZstdDecompressor().decompress(payload, max_output_size=ZSTD_MAX_OUTPUT_SIZE)
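A quick round-trip sketch over the codecs above (not part of this commit); gzip is always available, while the snappy pair is guarded by the optional dependency check.

raw = b'example payload bytes'
assert gzip_decode(gzip_encode(raw)) == raw
if has_snappy():
    assert snappy_decode(snappy_encode(raw)) == raw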
This diff is collapsed.
from __future__ import absolute_import
from kafka.consumer.group import KafkaConsumer
__all__ = [
'KafkaConsumer'
]
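A hedged usage sketch for the exported consumer (not part of this commit); the topic and broker address are placeholders, and iterating the consumer blocks while it polls for records.

consumer = KafkaConsumer('example-topic', bootstrap_servers='localhost:9092')
for message in consumer:
    print(message.topic, message.partition, message.offset, message.value)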