"""
    emitters.py

    Copyright (c) 2013-2014 Snowplow Analytics Ltd. All rights reserved.

    This program is licensed to you under the Apache License Version 2.0,
    and you may not use this file except in compliance with the Apache License
    Version 2.0. You may obtain a copy of the Apache License Version 2.0 at
    http://www.apache.org/licenses/LICENSE-2.0.

    Unless required by applicable law or agreed to in writing,
    software distributed under the Apache License Version 2.0 is distributed on
    an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
    express or implied. See the Apache License Version 2.0 for the specific
    language governing permissions and limitations there under.

    Authors: Anuj More, Alex Dean, Fred Blundun
    Copyright: Copyright (c) 2013-2014 Snowplow Analytics Ltd
    License: Apache License Version 2.0
"""

import json
import logging
import time
import threading

import requests

from snowplow_tracker.self_describing_json import SelfDescribingJson

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

DEFAULT_MAX_LENGTH = 10
PAYLOAD_DATA_SCHEMA = "iglu:com.snowplowanalytics.snowplow/payload_data/jsonschema/1-0-4"


class Emitter(object):
    """
    Synchronously send Snowplow events to a Snowplow collector
    Supports both GET and POST requests
    """

    def __init__(self, endpoint, protocol="http", port=None, method="get", buffer_size=None, on_success=None, on_failure=None, byte_limit=None):
        """
        :param endpoint: The collector URL. Don't include "http://" - this is done automatically.
        :type endpoint: string
        :param protocol: The protocol to use - http or https. Defaults to http.
        :type protocol: string
        :param port: The collector port to connect to
        :type port: int | None
        :param method: The HTTP request method - get or post
        :type method: string
        :param buffer_size: The maximum number of queued events before the buffer is flushed. Default is 10.
        :type buffer_size: int | None
        :param on_success: Callback executed after every HTTP request in a flush has status code 200
                           Gets passed the number of events flushed.
        :type on_success: function | None
        :param on_failure: Callback executed if at least one HTTP request in a flush has a status code other than 200
                           Gets passed two arguments:
                           1) The number of events which were successfully sent
                           2) If method is "post": The unsent data in string form;
                              If method is "get": An array of dictionaries corresponding to the unsent events' payloads
        :type on_failure: function | None
        :param byte_limit: The size (in bytes) the queued event list can reach before the buffer is flushed
        :type byte_limit: int | None
        """
        self.endpoint = Emitter.as_collector_uri(endpoint, protocol, port, method)

        self.method = method

        if buffer_size is None:
            if method == "post":
                buffer_size = DEFAULT_MAX_LENGTH
            else:
                buffer_size = 1
        self.buffer_size = buffer_size
        self.buffer = []
        self.byte_limit = byte_limit
        self.bytes_queued = None if byte_limit is None else 0

        self.on_success = on_success
        self.on_failure = on_failure

        self.lock = threading.RLock()

        self.timer = None

        logger.info("Emitter initialized with endpoint " + self.endpoint)
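
    # Usage sketch (illustrative only - the endpoint and the callbacks below are
    # hypothetical, not part of this module):
    #
    #     def success(count):
    #         print("Sent %d events" % count)
    #
    #     def failure(sent_count, unsent):
    #         print("Only %d events sent; %d left unsent" % (sent_count, len(unsent)))
    #
    #     emitter = Emitter("collector.example.com", method="post",
    #                       buffer_size=5, on_success=success, on_failure=failure)
    #     emitter.input({"e": "pv", "url": "http://www.example.com"})  # buffered
    #     emitter.flush()                                              # sent to the collector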

    @staticmethod
    def as_collector_uri(endpoint, protocol="http", port=None, method="get"):
        """
        :param endpoint: The raw endpoint provided by the user
        :type endpoint: string
        :param protocol: The protocol to use - http or https
        :type protocol: string
        :param port: The collector port to connect to
        :type port: int | None
        :param method: Either `get` or `post` HTTP method
        :type method: string
        :rtype: string
        """
        if method == "get":
            path = "/i"
        else:
            path = "/com.snowplowanalytics.snowplow/tp2"
        if port is None:
            return protocol + "://" + endpoint + path
        else:
            return protocol + "://" + endpoint + ":" + str(port) + path
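
    # For example (hypothetical endpoint), following the logic above:
    #
    #     Emitter.as_collector_uri("collector.example.com")
    #         -> "http://collector.example.com/i"
    #     Emitter.as_collector_uri("collector.example.com", "https", 443, "post")
    #         -> "https://collector.example.com:443/com.snowplowanalytics.snowplow/tp2"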

    def input(self, payload):
        """
        Adds an event to the buffer.
        If the maximum size has been reached, flushes the buffer.

        :param payload: The name-value pairs for the event
        :type payload: dict(string:*)
        """
        with self.lock:
            if self.bytes_queued is not None:
                self.bytes_queued += len(str(payload))

            if self.method == "post":
                self.buffer.append({key: str(payload[key]) for key in payload})
            else:
                self.buffer.append(payload)

            if self.reached_limit():
                self.flush()

    def reached_limit(self):
        """
        Checks whether the event-count or byte limit has been reached

        :rtype: bool
        """
        if self.byte_limit is None:
            return len(self.buffer) >= self.buffer_size
        else:
            return self.bytes_queued >= self.byte_limit or len(self.buffer) >= self.buffer_size
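
    # Flush-trigger sketch (the values below are illustrative, not defaults):
    #
    #     e = Emitter("collector.example.com", buffer_size=3, byte_limit=512)
    #     e.input({"e": "pv"})   # 1 event queued, under both limits - no flush
    #     e.input({"e": "pv"})   # 2 events queued - still no flush
    #     e.input({"e": "pv"})   # 3 events >= buffer_size, so input() calls flush()
    #
    # With byte_limit set, bytes_queued grows by len(str(payload)) per event, and a
    # flush also fires once it reaches byte_limit - whichever limit is hit first.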

    def flush(self):
        """
        Sends all events in the buffer to the collector.
        """
        with self.lock:
            self.send_events(self.buffer)
            self.buffer = []
            if self.bytes_queued is not None:
                self.bytes_queued = 0

    def http_post(self, data):
        """
        :param data: The array of JSONs to be sent
        :type data: string
        """
        logger.info("Sending POST request to %s..." % self.endpoint)
        logger.debug("Payload: %s" % data)
        r = requests.post(self.endpoint, data=data, headers={'content-type': 'application/json; charset=utf-8'})
        getattr(logger, "info" if self.is_good_status_code(r.status_code) else "warn")("POST request finished with status code: " + str(r.status_code))
        return r
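
    # The `data` string built in send_events() wraps the buffered events in a
    # self-describing JSON envelope, roughly of this shape (field values are
    # illustrative):
    #
    #     {
    #         "schema": "iglu:com.snowplowanalytics.snowplow/payload_data/jsonschema/1-0-4",
    #         "data": [{"e": "pv", "stm": "1414172400000", ...}, ...]
    #     }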

    def http_get(self, payload):
        """
        :param payload: The event properties
        :type payload: dict(string:*)
        """
        logger.info("Sending GET request to %s..." % self.endpoint)
        logger.debug("Payload: %s" % payload)
        r = requests.get(self.endpoint, params=payload)
        getattr(logger, "info" if self.is_good_status_code(r.status_code) else "warn")("GET request finished with status code: " + str(r.status_code))
        return r
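
    # On the GET path each event is sent as query parameters on /i, e.g.
    # (illustrative values): http://collector.example.com/i?e=pv&stm=1414172400000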

    def sync_flush(self):
        """
        Calls the flush method of the base Emitter class.
        This is guaranteed to be blocking, not asynchronous.
        """
        logger.debug("Starting synchronous flush...")
        Emitter.flush(self)
        logger.info("Finished synchronous flush")

    @staticmethod
    def is_good_status_code(status_code):
        """
        :param status_code: HTTP status code
        :type status_code: int
        :rtype: bool
        """
        return 200 <= status_code < 400
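
    # For example: is_good_status_code(200) -> True, is_good_status_code(302) -> True,
    # is_good_status_code(404) -> False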

    def send_events(self, evts):
        """
        :param evts: Array of events to be sent
        :type evts: list(dict(string:*))
        """
        if len(evts) > 0:
            logger.info("Attempting to send %s requests" % len(evts))
            Emitter.attach_sent_timestamp(evts)
            if self.method == 'post':
                data = SelfDescribingJson(PAYLOAD_DATA_SCHEMA, evts).to_string()
                post_succeeded = False
                try:
                    status_code = self.http_post(data).status_code
                    post_succeeded = self.is_good_status_code(status_code)
                except requests.RequestException as e:
                    logger.warn(e)
                if post_succeeded:
                    if self.on_success is not None:
                        self.on_success(len(evts))
                elif self.on_failure is not None:
                    self.on_failure(0, evts)

            elif self.method == 'get':
                success_count = 0
                unsent_requests = []
                for evt in evts:
                    get_succeeded = False
                    try:
                        status_code = self.http_get(evt).status_code
                        get_succeeded = self.is_good_status_code(status_code)
                    except requests.RequestException as e:
                        logger.warn(e)
                    if get_succeeded:
                        success_count += 1
                    else:
                        unsent_requests.append(evt)
                if len(unsent_requests) == 0:
                    if self.on_success is not None:
                        self.on_success(success_count)
                elif self.on_failure is not None:
                    self.on_failure(success_count, unsent_requests)
        else:
            logger.info("Skipping flush since buffer is empty")
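
    # Behaviour sketch: with method="post" the whole batch succeeds or fails as one
    # request, so on_failure (if set) receives (0, evts); with method="get" each event
    # is sent individually, so on_failure receives the count of successes plus the
    # list of payloads that were not sent. For example (hypothetical callback and
    # retry_queue):
    #
    #     def failure(sent_count, unsent):
    #         for payload in unsent:
    #             retry_queue.append(payload)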

    def set_flush_timer(self, timeout, flush_now=False):
        """
        Set an interval at which the buffer will be flushed

        :param timeout: interval in seconds
        :type timeout: int | float
        :param flush_now: immediately flush buffer
        :type flush_now: bool
        """

        # Create a new single-shot timer on every call so that the flush
        # repeats at the given interval
        if flush_now:
            self.flush()
        self.timer = threading.Timer(timeout, self.set_flush_timer, [timeout, True])
        self.timer.daemon = True
        self.timer.start()

    def cancel_flush_timer(self):
        """
        Abort automatic async flushing
        """

        if self.timer is not None:
            self.timer.cancel()
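
    # Timer usage sketch (the interval value is illustrative):
    #
    #     emitter.set_flush_timer(10)        # flush the buffer every 10 seconds from now on
    #     emitter.set_flush_timer(10, True)  # same, but also flush immediately
    #     emitter.cancel_flush_timer()       # stop the periodic flushing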

    @staticmethod
    def attach_sent_timestamp(events):
        """
        Attach (by mutating in-place) the current timestamp in milliseconds
        as the `stm` param

        :param events: Array of events to be sent
        :type events: list(dict(string:*))
        :rtype: None
        """
        def update(e):
            # Multiply before truncating to int so the value keeps millisecond precision
            e.update({'stm': str(int(time.time() * 1000))})

        [update(event) for event in events]
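
    # Effect sketch: attach_sent_timestamp mutates each event dict in place,
    # e.g. {"e": "pv"} becomes {"e": "pv", "stm": "1414172400000"}
    # (the timestamp value shown is illustrative).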