dbt-selly/dbt-env/lib/python3.8/site-packages/snowplow_tracker/emitters.py

277 lines
10 KiB
Python
Raw Normal View History

2022-03-22 15:13:27 +00:00
"""
emitters.py
Copyright (c) 2013-2014 Snowplow Analytics Ltd. All rights reserved.
This program is licensed to you under the Apache License Version 2.0,
and you may not use this file except in compliance with the Apache License
Version 2.0. You may obtain a copy of the Apache License Version 2.0 at
http://www.apache.org/licenses/LICENSE-2.0.
Unless required by applicable law or agreed to in writing,
software distributed under the Apache License Version 2.0 is distributed on
an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either
express or implied. See the Apache License Version 2.0 for the specific
language governing permissions and limitations there under.
Authors: Anuj More, Alex Dean, Fred Blundun
Copyright: Copyright (c) 2013-2014 Snowplow Analytics Ltd
License: Apache License Version 2.0
"""
import json
import logging
import time
import threading
import requests
from snowplow_tracker.self_describing_json import SelfDescribingJson
# Module-level logger used by the emitter; its level is pinned to INFO on import.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Buffer size used when a POST emitter is created without an explicit buffer_size.
DEFAULT_MAX_LENGTH = 10
# Iglu schema URI used to wrap the list of events sent in a single POST request.
PAYLOAD_DATA_SCHEMA = "iglu:com.snowplowanalytics.snowplow/payload_data/jsonschema/1-0-4"
class Emitter(object):
    """
    Synchronously send Snowplow events to a Snowplow collector.

    Supports both GET and POST requests. Events are queued in an in-memory
    buffer and sent when the buffer reaches its event-count or byte-size
    limit, or when a flush is requested explicitly or by the flush timer.
    """

    def __init__(self, endpoint, protocol="http", port=None, method="get",
                 buffer_size=None, on_success=None, on_failure=None,
                 byte_limit=None):
        """
        :param endpoint:    The collector URL. Don't include "http://" - this is done automatically.
        :type  endpoint:    string
        :param protocol:    The protocol to use - http or https. Defaults to http.
        :type  protocol:    string
        :param port:        The collector port to connect to
        :type  port:        int | None
        :param method:      The HTTP request method ("get" or "post")
        :type  method:      string
        :param buffer_size: The maximum number of queued events before the buffer is flushed.
                            Defaults to 10 for "post" and to 1 for any other method.
        :type  buffer_size: int | None
        :param on_success:  Callback executed when every HTTP request in a flush
                            returned a status code in the range 200-399.
                            Gets passed the number of events flushed.
        :type  on_success:  function | None
        :param on_failure:  Callback executed if at least one HTTP request in a flush
                            failed (status code outside 200-399, or a requests exception).
                            Gets passed two arguments:
                            1) The number of events which were successfully sent
                            2) A list of dictionaries corresponding to the unsent
                               events' payloads (for "post": the whole batch)
        :type  on_failure:  function | None
        :param byte_limit:  The size of the queued events (measured as the length of
                            each payload's string form) at which the buffer is flushed
        :type  byte_limit:  int | None
        """
        self.endpoint = Emitter.as_collector_uri(endpoint, protocol, port, method)

        self.method = method

        if buffer_size is None:
            # POST batches events; every other method sends each event immediately.
            buffer_size = DEFAULT_MAX_LENGTH if method == "post" else 1
        self.buffer_size = buffer_size
        self.buffer = []
        self.byte_limit = byte_limit
        # Byte accounting is only maintained when a byte limit was requested.
        self.bytes_queued = None if byte_limit is None else 0

        self.on_success = on_success
        self.on_failure = on_failure

        # Re-entrant because input() calls flush() while already holding the lock.
        self.lock = threading.RLock()

        self.timer = None

        logger.info("Emitter initialized with endpoint %s", self.endpoint)

    @staticmethod
    def as_collector_uri(endpoint, protocol="http", port=None, method="get"):
        """
        Build the full collector URI for the given endpoint.

        :param endpoint:  The raw endpoint provided by the user
        :type  endpoint:  string
        :param protocol:  The protocol to use - http or https
        :type  protocol:  string
        :param port:      The collector port to connect to
        :type  port:      int | None
        :param method:    Either `get` or `post` HTTP method
        :type  method:    string
        :rtype:           string
        """
        # "get" uses the pixel path; anything else uses the tp2 path.
        path = "/i" if method == "get" else "/com.snowplowanalytics.snowplow/tp2"
        if port is None:
            return protocol + "://" + endpoint + path
        return protocol + "://" + endpoint + ":" + str(port) + path

    def input(self, payload):
        """
        Adds an event to the buffer.
        If the maximum size has been reached, flushes the buffer.

        :param payload:   The name-value pairs for the event
        :type  payload:   dict(string:*)
        """
        with self.lock:
            if self.bytes_queued is not None:
                self.bytes_queued += len(str(payload))

            if self.method == "post":
                # POST bodies carry every value in string form.
                self.buffer.append({key: str(payload[key]) for key in payload})
            else:
                self.buffer.append(payload)

            if self.reached_limit():
                self.flush()

    def reached_limit(self):
        """
        Checks if the event-count or byte limit has been reached.

        :rtype: bool
        """
        if self.byte_limit is None:
            return len(self.buffer) >= self.buffer_size
        return self.bytes_queued >= self.byte_limit or len(self.buffer) >= self.buffer_size

    def flush(self):
        """
        Sends all events in the buffer to the collector, then empties the
        buffer and resets the byte counter.
        """
        # NOTE(review): the lock is held for the duration of the HTTP requests
        # and no timeout is passed to requests, so a stalled collector blocks
        # other callers of input()/flush() - confirm this is acceptable.
        with self.lock:
            self.send_events(self.buffer)
            self.buffer = []
            if self.bytes_queued is not None:
                self.bytes_queued = 0

    def http_post(self, data):
        """
        Send one POST request to the collector endpoint.

        :param data:  The array of JSONs to be sent
        :type  data:  string
        :return:      The response object returned by requests.post
        """
        logger.info("Sending POST request to %s...", self.endpoint)
        logger.debug("Payload: %s", data)
        r = requests.post(self.endpoint, data=data, headers={'content-type': 'application/json; charset=utf-8'})
        # logger.warning replaces the deprecated logger.warn alias.
        log = logger.info if self.is_good_status_code(r.status_code) else logger.warning
        log("POST request finished with status code: " + str(r.status_code))
        return r

    def http_get(self, payload):
        """
        Send one GET request to the collector endpoint.

        :param payload:  The event properties, sent as query parameters
        :type  payload:  dict(string:*)
        :return:         The response object returned by requests.get
        """
        logger.info("Sending GET request to %s...", self.endpoint)
        logger.debug("Payload: %s", payload)
        r = requests.get(self.endpoint, params=payload)
        # logger.warning replaces the deprecated logger.warn alias.
        log = logger.info if self.is_good_status_code(r.status_code) else logger.warning
        log("GET request finished with status code: " + str(r.status_code))
        return r

    def sync_flush(self):
        """
        Calls the flush method of the base Emitter class.
        This is guaranteed to be blocking, not asynchronous.
        """
        logger.debug("Starting synchronous flush...")
        # Call the base implementation explicitly so a subclass override of
        # flush() is bypassed.
        Emitter.flush(self)
        logger.info("Finished synchrous flush")

    @staticmethod
    def is_good_status_code(status_code):
        """
        Tell whether a status code counts as a successful send (200-399).

        :param status_code:  HTTP status code
        :type  status_code:  int
        :rtype:              bool
        """
        return 200 <= status_code < 400

    def send_events(self, evts):
        """
        Send the given events using the configured HTTP method and invoke
        the on_success / on_failure callbacks with the outcome.

        The events are mutated in place: a `stm` (sent timestamp) entry is
        attached to each one before sending.

        :param evts: Array of events to be sent
        :type  evts: list(dict(string:*))
        """
        if not evts:
            logger.info("Skipping flush since buffer is empty")
            return

        logger.info("Attempting to send %s requests", len(evts))
        Emitter.attach_sent_timestamp(evts)
        if self.method == 'post':
            self._send_post(evts)
        elif self.method == 'get':
            self._send_get(evts)

    def _request_succeeded(self, send, body):
        """
        Run one HTTP request and report whether it returned a good status code.

        A requests.RequestException is logged as a warning and counted as a
        failed request; any other exception propagates to the caller.
        """
        try:
            return self.is_good_status_code(send(body).status_code)
        except requests.RequestException as e:
            logger.warning(e)
            return False

    def _send_post(self, evts):
        """Send all events as one POST batch and report success or failure."""
        data = SelfDescribingJson(PAYLOAD_DATA_SCHEMA, evts).to_string()
        if self._request_succeeded(self.http_post, data):
            if self.on_success is not None:
                self.on_success(len(evts))
        elif self.on_failure is not None:
            # The batch is all-or-nothing, so nothing counts as sent.
            self.on_failure(0, evts)

    def _send_get(self, evts):
        """Send each event as its own GET request and report the totals."""
        success_count = 0
        unsent_requests = []
        for evt in evts:
            if self._request_succeeded(self.http_get, evt):
                success_count += 1
            else:
                unsent_requests.append(evt)

        if not unsent_requests:
            if self.on_success is not None:
                self.on_success(success_count)
        elif self.on_failure is not None:
            self.on_failure(success_count, unsent_requests)

    def set_flush_timer(self, timeout, flush_now=False):
        """
        Set an interval at which the buffer will be flushed.

        :param timeout:   interval in seconds
        :type  timeout:   int | float
        :param flush_now: immediately flush buffer
        :type  flush_now: bool
        """
        if flush_now:
            self.flush()
        # threading.Timer fires once, so each run schedules its successor,
        # which flushes (flush_now=True) and re-arms again.
        self.timer = threading.Timer(timeout, self.set_flush_timer, [timeout, True])
        # Daemon thread: a pending timer does not keep the interpreter alive.
        self.timer.daemon = True
        self.timer.start()

    def cancel_flush_timer(self):
        """
        Abort automatic async flushing by cancelling the pending timer, if any.
        """
        if self.timer is not None:
            self.timer.cancel()

    @staticmethod
    def attach_sent_timestamp(events):
        """
        Attach (by mutating in-place) current timestamp in milliseconds
        as `stm` param

        :param events: Array of events to be sent
        :type  events: list(dict(string:*))
        :rtype: None
        """
        for event in events:
            # Scale to milliseconds before truncating; truncating first would
            # discard the sub-second part of the timestamp.
            event.update({'stm': str(int(time.time() * 1000))})