The Joint Accelerator Conferences Website (JACoW) is an international collaboration that publishes the proceedings of accelerator conferences held around the world.
% ICALEPCS 2019 contribution TUDPP01 (JACoW proceedings).
% Authors use the "Last, First" form so that the two-word surname
% "Chibante Barroso" is not split by BibTeX's name parser.
% Only the acronyms in the title are brace-protected, so the bibliography
% style stays free to apply its own casing to the remaining words.
@InProceedings{vino:icalepcs2019-tudpp01,
  author    = {Vino, G. and Chibante Barroso, V. and Elia, D. and Wegrzynek, A.},
  title     = {A Monitoring System for the New {ALICE} {O2} Farm},
  booktitle = {Proc. ICALEPCS'19},
  pages     = {835--840},
  paper     = {TUDPP01},
  language  = {english},
  keywords  = {monitoring, detector, network, database, controls},
  venue     = {New York, NY, USA},
  series    = {International Conference on Accelerator and Large Experimental Physics Control Systems},
  number    = {17},
  publisher = {JACoW Publishing, Geneva, Switzerland},
  month     = aug,
  year      = {2020},
  issn      = {2226-0358},
  isbn      = {978-3-95450-209-7},
  doi       = {10.18429/JACoW-ICALEPCS2019-TUDPP01},
  url       = {https://jacow.org/icalepcs2019/papers/tudpp01.pdf},
  note      = {https://doi.org/10.18429/JACoW-ICALEPCS2019-TUDPP01},
  abstract  = {The ALICE Experiment has been designed to study the physics of strongly interacting matter with heavy-ion collisions at the CERN LHC. A major upgrade of the detector and computing model (O2, Offline-Online) is currently ongoing. The ALICE O2 farm will consist of almost 1000 nodes enabled to readout and process on-the-fly about 27 Tb/s of raw data. To increase the efficiency of computing farm operations a general-purpose near real-time monitoring system has been developed: it lays on features like high-performance, high-availability, modularity, and open source. The core component (Apache Kafka) ensures high throughput, data pipelines, and fault-tolerant services. Additional monitoring functionality is based on Telegraf as metric collector, Apache Spark for complex aggregation, InfluxDB as time-series database, and Grafana as visualization tool. A logging service based on Elasticsearch stack is also included. The designed system handles metrics coming from operating system, network, custom hardware, and in-house software. A prototype version is currently running at CERN and has been also successfully deployed by the ReCaS Datacenter at INFN Bari for both monitoring and logging.},
}