Server : Apache System : Linux indy02.toastserver.com 3.10.0-962.3.2.lve1.5.85.el7.x86_64 #1 SMP Thu Apr 18 15:18:36 UTC 2024 x86_64 User : palandch ( 1163) PHP Version : 7.1.33 Disable Function : NONE Directory : /opt/cloudlinux/venv/lib64/python3.11/site-packages/xray/agent/ |
# -*- coding: utf-8 -*-

# Copyright © Cloud Linux GmbH & Cloud Linux Software, Inc 2010-2021 All Rights Reserved
#
# Licensed under CLOUD LINUX LICENSE AGREEMENT
# http://cloudlinux.com/docs/LICENSE.TXT

"""
This module contains classes implementing X-Ray Agent behaviour
"""
import io
import json
import logging
import os
import queue
import re
import signal
import socket
import subprocess
import time
import typing
from threading import Thread, current_thread, Lock
from typing import Any, Optional, Dict, Tuple
from dataclasses import dataclass

import psutil

from .executor import BoundedThreadExecutor
from xray import gettext as _
from xray.apiclient import get_client
from xray.internal.constants import local_tasks_storage
from xray.internal.exceptions import XRayError, XRayAPIError
from .fault_detector import FaultDetector
from xray.internal.local_counters import (
    open_local_storage,
    flush_memory_storage,
    get_task_ids
)
from xray.internal.types import Task
from xray.internal.user_plugin_utils import extract_creds
from xray.internal.utils import (
    dbm_storage,
    get_current_cpu_throttling_time
)

if typing.TYPE_CHECKING:
    from xray.apiclient.api_client import (
        SendClient,
        SmartAdviceAPIClient,
        APIClient
    )


@dataclass
class APIDataContainer:
    """
    Pair of objects kept in the Agent's in-memory cache for one task
    """
    # API client instantiated for the task
    client: 'APIClient'
    # task object returned by client.get_task()
    task: Task


logger = logging.getLogger(__name__)

# A string wrapped in single or double quotes, or starting with a quote and
# finishing with ... (truncation marker); a closing quote preceded by
# one of \ | / is not treated as the end of the string
_QUOTED_LITERAL_RE = re.compile(
    r"""(?P<quote>['"])(?P<in_quote>.*?)((?<![\\|/])(?P=quote)|(?P<trunc>\.{3}))""")
# A single character inside quotes that has to be masked
_SANITIZED_CHAR_RE = re.compile(
    r"((?P<digit>\d)|(?P<symbol>[^0-9_:;\-/',. \\]))")


class Agent:
    """
    X-Ray Agent class
    """
    COUNTERS_FLUSH_INTERVAL = 15
    MONGO_FLUSH_INTERVAL = 60
    CLEANUP_INTERVAL = 43200  # once in 12 hours

    def __init__(self, system_id,
                 # keep max_connections quite big to handle spikes
                 max_connections=psutil.cpu_count() * 8,
                 # max_workers can also be quite big because they are not cpu-bound
                 max_workers=psutil.cpu_count() * 4,
                 maxqueuesize=psutil.cpu_count() * 16):
        """
        Create API clients, the API data cache and the fault detector

        :param system_id: identifier passed to the API clients as system_id
        :param max_connections: size of the pool accepting connections
        :param max_workers: size of the pool processing requests
        :param maxqueuesize: queue size of the workers pool
        """
        self.sys_id = system_id
        self.maxqueuesize = maxqueuesize
        self.max_connections = max_connections
        self.max_workers = max_workers

        # don't process SIGUSR2 with default handler
        signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGUSR2})

        # initialize ClickHouse API client to send requests data
        clickhouse_client_object: typing.Type[SendClient] = get_client('api_req')
        self.send_client: SendClient = clickhouse_client_object(system_id=self.sys_id)

        # initialize Adviser API client to send requests data
        adviser_client_object: typing.Type[SmartAdviceAPIClient] = get_client('adviser')
        self.adviser_client: SmartAdviceAPIClient = adviser_client_object()

        # initial state of MongoDB API client to interact with tasks
        self.task_client_object: typing.Type[APIClient] = get_client()

        # initialize storage for cache of remote API data
        self.api_data_cache_lock = Lock()
        self.api_data_cache: Dict[str, APIDataContainer] = dict()

        # initialize Fault Detector
        self.fault_detector = FaultDetector()

        self.signal_handler_thread: Optional[Thread] = None
        self.flusher_thread: Optional[Thread] = None

    def _wait_for_sigusr2(self):
        """
        Block until SIGUSR2 arrives, flush counters and storage, then
        send SIGUSR2 back to the process that sent the signal
        """
        siginfo = signal.sigwaitinfo({signal.SIGUSR2})
        logger.info('Received SIGUSR2 from pid=%s, '
                    'flushing database storage on disk', siginfo.si_pid)
        self._flush_mongodb_counters()
        flush_memory_storage()
        logger.info('Sending signal back to process that requested storage flush')
        try:
            os.kill(siginfo.si_pid, signal.SIGUSR2)
        except OSError:
            logger.warning('Process that requested storage flush no longer exists')

    def _setup_signal_handler(self):
        """
        Setup SIGUSR2 handler that starts in-memory storage flush
        when received.
        When flushed, send SIGUSR2 back to the process that sent the signal.
        """
        signal.pthread_sigmask(signal.SIG_BLOCK, {signal.SIGUSR2})
        while True:
            try:
                self._wait_for_sigusr2()
            except Exception:
                # keep the loop alive whatever happens during a single flush
                logger.exception('Unable to process signal, see traceback for details.')

    def _flush_mongodb_counters(self, task_id=None):
        """
        Push locally stored request counters to MongoDB

        :param task_id: flush only this task; when None, every id returned
                        by get_task_ids() is flushed
        """
        tasks_to_flush = [task_id] if task_id is not None else get_task_ids()
        for current_id in tasks_to_flush:
            logger.info('Updating task requests counters in mongo for task_id=%s',
                        current_id)
            try:
                apiclient, task = self.get_cached_or_load(current_id)
            except XRayError:
                logger.warning('Unable to get client and task %s', current_id)
                continue

            # read stored request_id
            with open_local_storage(current_id) as storage:
                task.update_with_local_data(next_request_id=storage.next_request_id)

            if task.tracing_by == 'time':
                # tracing_count for task by time represents number of minutes
                # left to active tracing and is updated by stop|continue
                # task routines only
                self.update_counts(apiclient, task.request_count)
            else:
                # tracing_count for task by request_qty depends on number of
                # collected requests, thus should be updated alongside
                self.update_counts(apiclient, task.request_count,
                                   task.tracing_count)

    def _flusher(self):
        """
        This method flushes data from memory to local storage periodically.
        """
        last_mongo_flush_time = 0
        last_api_data_cache_cleanup = 0
        while True:
            time.sleep(self.COUNTERS_FLUSH_INTERVAL)
            try:
                if time.time() - last_mongo_flush_time > self.MONGO_FLUSH_INTERVAL:
                    self._flush_mongodb_counters()
                    flush_memory_storage()
                    last_mongo_flush_time = time.time()
                    # we should cleanup API data cache only after flushing counters
                    # in order not to lose counters for already inactive tasks
                    if time.time() - last_api_data_cache_cleanup > self.CLEANUP_INTERVAL:
                        self.cleanup_api_data_cache()
                        last_api_data_cache_cleanup = time.time()
                else:
                    flush_memory_storage(remove=False)
            except Exception:
                # same policy as the signal loop: a single failed flush must
                # not terminate the thread; timestamps are left untouched so
                # the flush is attempted again on the next tick
                logger.exception('Periodic flush failed, see traceback for details.')

    def start(self, sock: socket.socket,
              background_routine: bool = False,
              loops: Optional[int] = None) -> None:
        """
        Start listening socket

        :param sock: socket to accept connections on
        :param background_routine: also start the signal handling and
                                   periodic flushing threads
        :param loops: number of connections to accept before returning;
                      None means accept forever
        """
        logger.info('Starting daemon')
        if background_routine:
            # setup signal handlers
            self.signal_handler_thread = Thread(target=self._setup_signal_handler)
            self.signal_handler_thread.start()

            # start periodical database flushing
            self.flusher_thread = Thread(target=self._flusher)
            self.flusher_thread.start()

        with BoundedThreadExecutor(
                max_workers=self.max_workers,
                maxqueuesize=self.maxqueuesize) as workers_pool, \
            BoundedThreadExecutor(
                max_workers=self.max_connections,
                # turn off the queue for connections because we
                # don't want the php processes to wait in queue
                # and slow down the php processes
                maxqueuesize=0
            ) as connections_pool:
            while loops is None or loops:
                connection, address = sock.accept()
                try:
                    connections_pool.submit(
                        self.handle_incoming_connection, connection, workers_pool)
                except queue.Full:
                    logger.error('Request %s was rejected because our connections thread pool '
                                 'is full of tasks. Increase max_connections in configuration.',
                                 address or '<unknown peer>')
                    connection.close()
                if loops is not None:
                    loops -= 1

    def add_limit_faults(self, data, t_key, cpu_value):
        """
        Calculate throttling values and update given data
        with resulting throttling stat

        :param data: request data dict, updated in place with
                     'hitting_limits' and 'throttled_time'
        :param t_key: key passed to the fault detector
        :param cpu_value: CPU value passed to the fault detector
        """
        data['hitting_limits'], data['throttled_time'] = self.fault_detector(
            t_key, cpu_value)

    def _handle_request_init(self, php_pid: int, cpu_usage: int):
        """
        Called when php request starts and sends us
        welcome request meaning that request started on the php side.
        """
        logger.info('Received request init trigger from php=%s with cpu_usage=%s',
                    php_pid, cpu_usage)
        # save current CPU throttling time and timestamp
        self.fault_detector.save(php_pid, cpu_usage)
        # attempt to flush expired entries
        self.fault_detector.flush()

    def _handle_request_end(self, php_pid: int, cpu_usage: int, request_data: dict):
        """
        Called when a connection delivered request data: add throttling
        stat when it is missing and pass the data on for processing
        """
        # otherwise calculate throttling fact, add it to data
        # and send gathered stat to CH
        if request_data.get('hitting_limits') is None:
            # only calculate faults if extension failed to get them itself
            self.add_limit_faults(request_data, php_pid, cpu_usage)

        logger.info('[%s] Processing trace for task %s (%s)',
                    current_thread().name,
                    request_data.get('tracing_task_id'),
                    request_data.get('url'))
        self.process_request_data(request_data)

    def handle_incoming_connection(self, connection: socket.socket,
                                   workers_pool: BoundedThreadExecutor) -> None:
        """
        Handle incoming connection
        :param connection: socket object usable to
                           send and receive data on the connection
        :param workers_pool: pool where we can place tasks for the further processing
        """
        try:
            _pid, _uid, _gid = extract_creds(connection)
            current_cpu = get_current_cpu_throttling_time(_uid)
            # the object returned by makefile() holds its own reference to
            # the socket: it must be closed as well, otherwise
            # connection.close() below does not release the descriptor
            with connection.makefile(errors='ignore') as fileobj:
                input_data = self.read_input(fileobj)
        except json.JSONDecodeError as e:
            logger.error('JSON decode failed: %s',
                         str(e),
                         extra={'t_name': current_thread().name})
            return
        finally:
            # close connection as soon as possible to
            # allow new clients to be connected
            connection.close()

        try:
            # continue in different pool and threads because we don't want
            # our php processes to wait until processing is complete
            if input_data is None:
                workers_pool.submit(self._handle_request_init,
                                    php_pid=_pid, cpu_usage=current_cpu)
            else:
                workers_pool.submit(self._handle_request_end,
                                    php_pid=_pid, cpu_usage=current_cpu,
                                    request_data=input_data)
        except queue.Full:
            logger.error('Request %s was rejected because our workers thread pool '
                         'is full of tasks. Increase queuemaxsize or max_threads in configuration.',
                         _pid)

    def read_input(self, fileio: io.TextIOBase) -> Any:
        """
        Read input data and return decoded json
        :param fileio: a file-like object providing read method
        :return: decoded json, or None when the input is empty or
                 whitespace only
        """
        data = fileio.read()
        logger.debug('I received data: %s', data)
        payload = data.strip()
        if not payload:
            return None
        return json.loads(payload, strict=False)

    def instantiate_mongo_client(self, fake_task_id: str) -> 'APIClient':
        """
        Initialize MongoDB client for current task

        :param fake_task_id: key to look up in the local tasks storage
        :raises XRayError: when the id is not in the storage or the
                           storage raises RuntimeError
        """
        try:
            with dbm_storage(local_tasks_storage) as task_storage:
                try:
                    real_id = task_storage[fake_task_id].decode()
                except KeyError:
                    raise XRayError(
                        _("Cannot resolve tracing_task_id: no match found in storage"),
                        extra={'id': fake_task_id,
                               'all_ids': task_storage.keys()})
        except RuntimeError as e:
            raise XRayError(_("Cannot resolve tracing_task_id: %s") % str(e)) from e
        return self.task_client_object(system_id=self.sys_id,
                                       tracing_task_id=real_id)

    def get_cached_or_load(self, fake_task_id: str) -> Tuple['APIClient', Task]:
        """
        Returns a client and task from cache of API data or
        initialize client and GET task from MongoDB and add to cache
        """
        logger.debug('Cached API data: %s', self.api_data_cache)
        cached_data = self.api_data_cache.get(fake_task_id)
        if cached_data is not None:
            return cached_data.client, cached_data.task

        apiclient = self.instantiate_mongo_client(fake_task_id)
        _t = apiclient.get_task()
        logger.debug('Adding new container in cache: %s --> %s, %s',
                     fake_task_id, _t, apiclient)
        self.api_data_cache[fake_task_id] = APIDataContainer(client=apiclient,
                                                             task=_t)
        return apiclient, _t

    def cleanup_api_data_cache(self) -> None:
        """
        Cleanup an API data in-memory cache dict in order not
        store inactive (stopped, already completed) tasks there
        """
        try:
            with dbm_storage(local_tasks_storage) as task_storage:
                active_tasks = [k.decode() for k in task_storage.keys()]
        except RuntimeError:
            logger.warning(
                'Unable to cleanup cache, storage unavailable')
            return

        # iterate over a snapshot of keys, entries are removed on the way
        for _task in list(self.api_data_cache.keys()):
            with self.api_data_cache_lock:
                if _task in self.api_data_cache and _task not in active_tasks:
                    logger.info('Cleaning up inactive container %s', _task)
                    self.api_data_cache.pop(_task)

    def process_request_data(self, request_data: dict) -> None:
        """
        Increment request ID in /usr/share/alt-php-xray/requests/{tracing_task_id} file
        Substitute request_id and tracing_task_id in request_data.
        Send request_data to ClickHouse
        :param request_data: original request data
        """
        _, task = self.get_cached_or_load(request_data['tracing_task_id'])
        logger.info('Processing task: %s', task.task_id)

        with open_local_storage(request_data['tracing_task_id'],
                                flush=task.is_manual) as storage:
            # read stored request_id
            task.update_with_local_data(next_request_id=storage.next_request_id)
            if task.tracing_count <= 0:
                logger.info('Tracing count is 0, nothing should be done')
                return

            # update input data with stored request_id
            updated_request_data = self.update_request_data(request_data, task)
            # send data with updated ids
            logger.info('Sending to ClickHouse')
            self.send_client(updated_request_data)
            try:
                logger.info('Sending to SmartAdvice')
                self.adviser_client(updated_request_data)
            except XRayAPIError:
                # ignore all errors occurring within smart advice
                # microservice intercommunication
                pass

            # then increment request_id counter
            storage.next_request_id += 1

            # locally recalculate how much requests left to process
            task.update_with_local_data(next_request_id=storage.next_request_id)

        if task.is_manual:
            self._flush_mongodb_counters(task.fake_id)

        if task.tracing_by != 'time' and task.tracing_count <= 0:
            self.complete_task(task)

    def update_request_data(self, data: dict, task: Task) -> dict:
        """
        Substitute request_id and tracing_task_id
        :param data: original input
        :param task: a Task instance
        :return: updated input (the same dict object, modified in place)
        """
        request_id = task.request_count + 1
        data['request_id'] = request_id
        data['tracing_task_id'] = task.task_id
        for item in data['data']:
            item['request_id'] = request_id
            item['tracing_task_id'] = task.task_id
            if item['type'] == 'mysql_query':
                item['query'] = self.hide_symbols(item['query'])

        logger.info('Input updated: tracing_task_id = %s & request_id = %s',
                    data.get('tracing_task_id'), data.get('request_id'))
        logger.debug('Full updated input %s', data)
        return data

    def update_counts(self, client: 'APIClient', request_count: int,
                      tracing_count: Optional[int] = None) -> None:
        """
        Update task counters in mongodb instance

        :param client: API client of the task
        :param request_count: value passed as request_count
        :param tracing_count: value passed as tracing_count (None by default)
        """
        client.update_counts_only(tracing_count=tracing_count,
                                  request_count=request_count)

    def complete_task(self, _task: Task) -> None:
        """
        Stop and complete request_qty task
        :param _task: tracing task to stop
        """
        logger.info('Task %s should be completed', _task.task_id)
        # delay for MongoDB to process counts, received lately (see XRAY-87)
        time.sleep(1)
        self._run_complete_task_cmd(_task.task_id)

    def _run_complete_task_cmd(self, task_id):
        """
        Run the cloudlinux-xray-manager stop command for the given task

        :raises subprocess.CalledProcessError: on non-zero exit status
        """
        subprocess.check_output([
            'cloudlinux-xray-manager', 'stop',
            '--system_id', self.sys_id,
            '--tracing_task_id', task_id
        ])

    @staticmethod
    def hide_symbols(mysql_query: str) -> str:
        """
        Sanitize data in single or double quotes from MySQL query:
        digits inside quotes become '0', other characters become 'x',
        except for the characters _ : ; - / ' , . space and backslash
        which are kept as is
        """

        def inner_repl(inner_m):
            """
            Works with characters inside quotes
            """
            # the pattern matches either a digit or a symbol, nothing else
            return '0' if inner_m.group('digit') else 'x'

        def replacer(m):
            """
            Works with whole string in single or double quotes
            """
            q = m.group('quote')
            t = m.group('trunc')
            sanitized = _SANITIZED_CHAR_RE.sub(inner_repl, m.group('in_quote'))
            # wrap sanitized string back with originally detected characters
            # (quotes/truncation marker)
            return f'{q}{sanitized}{t or q}'

        return _QUOTED_LITERAL_RE.sub(replacer, mysql_query)