%
% GENERATED FROM http://acme.able.cs.cmu.edu
% by : anonymous
% IP : ec2-54-85-255-74.compute-1.amazonaws.com
% at : Mon, 18 Mar 2024 22:53:46 -0400 GMT
%
% Selection : Author: Maria_Casimiro
%

@InProceedings{2020:ICDS:Lynceus,
  AUTHOR = {Casimiro, Maria and Didona, Diego and Romano, Paolo and Rodrigues, Luis and Zwaenepoel, Willy and Garlan, David},
  TITLE = {Lynceus: Cost-efficient Tuning and Provisioning of Data Analytic Jobs},
  YEAR = {2020},
  MONTH = {8-10 July},
  BOOKTITLE = {The 40th International Conference on Distributed Computing Systems},
  ADDRESS = {Singapore},
  PDF = {http://acme.able.cs.cmu.edu/pubs/uploads/pdf/ICDCS_2020_paper_2972020_ICDS_Lynceus.pdf},
  ABSTRACT = {Modern data analytic and machine learning jobs find in the cloud a natural deployment platform to satisfy their notoriously large resource requirements. Yet, to achieve cost efficiency, it is crucial to identify a deployment configuration that satisfies user-defined QoS constraints (e.g., on execution time) while avoiding unnecessary over-provisioning. This paper introduces Lynceus, a new approach for the optimization of cloud-based data analytic jobs that improves over state-of-the-art approaches by enabling significant cost savings both in terms of the final recommended configuration and of the optimization process used to recommend configurations. Unlike existing solutions, Lynceus jointly optimizes both the cloud-related parameters (i.e., which and how many machines to provision) and the application-level parameters (e.g., the hyper-parameters of a machine learning algorithm). This allows for a reduction of the cost of recommended configurations by up to 3.7X at the 90th percentile with respect to existing approaches, which treat the optimization of cloud-related and application-level parameters as two independent problems. Further, Lynceus reduces the cost of the optimization process (i.e., the cloud cost incurred for testing configurations) by up to 11X. Such an improvement is achieved thanks to two mechanisms: i) a timeout approach that makes it possible to abort the exploration of configurations deemed suboptimal, while still extracting useful information to guide future explorations and to improve its predictive model, unlike recent works, which either incur the full cost of testing suboptimal configurations or are unable to extract any knowledge from aborted runs; ii) a long-sighted and budget-aware technique that determines which configurations to test by predicting the long-term impact of each exploration, unlike state-of-the-art approaches for the optimization of cloud jobs, which adopt greedy optimization methods.},
  KEYWORDS = {Big data, Machine Learning, Resource Allocation}
}

@InProceedings{2020:MASCOTS,
  AUTHOR = {Mendes, Pedro and Casimiro, Maria and Romano, Paolo and Garlan, David},
  TITLE = {TrimTuner: Efficient Optimization of Machine Learning Jobs in the Cloud via Sub-Sampling},
  YEAR = {2020},
  BOOKTITLE = {Proceedings of the 2020 Symposium on Modelling, Analysis, and Simulation of Computer and Telecommunication Systems (MASCOTS 2020)},
  PDF = {http://acme.able.cs.cmu.edu/pubs/uploads/pdf/TrimTuner-MASCOTS-20.pdf},
  ABSTRACT = {This work introduces TrimTuner, the first system for optimizing machine learning jobs in the cloud that exploits sub-sampling techniques to reduce the cost of the optimization process while taking user-specified constraints into account.
TrimTuner jointly optimizes the cloud and application-specific parameters and, unlike state-of-the-art works for cloud optimization, eschews the need to train the model with the full training set every time a new configuration is sampled. Indeed, by leveraging sub-sampling techniques and datasets that are up to 60x smaller than the original one, we show that TrimTuner can reduce the cost of the optimization process by up to 50x. Further, TrimTuner speeds up the recommendation process by 65x with respect to state-of-the-art techniques for hyper-parameter optimization that use sub-sampling techniques. The reasons for this improvement are twofold: i) a novel domain-specific heuristic that reduces the number of configurations for which the acquisition function has to be evaluated; ii) the adoption of an ensemble of decision trees that boosts the speed of the recommendation process by one additional order of magnitude.},
  KEYWORDS = {Machine Learning}
}

@InProceedings{SAML21,
  AUTHOR = {Casimiro, Maria and Romano, Paolo and Garlan, David and Moreno, Gabriel A. and Kang, Eunsuk and Klein, Mark},
  TITLE = {Self-Adaptation for Machine Learning Based Systems},
  YEAR = {2021},
  MONTH = {14 September},
  BOOKTITLE = {Proceedings of the 1st International Workshop on Software Architecture and Machine Learning (SAML)},
  SERIES = {LNCS},
  ADDRESS = {Virtual (originally V\"{a}xj\"{o}, Sweden)},
  PUBLISHER = {Springer},
  PDF = {http://acme.able.cs.cmu.edu/pubs/uploads/pdf/SAML2021-paper6SAML21.pdf},
  ABSTRACT = {Today's world is witnessing a shift from human-written software to machine-learned software, with the rise of systems that rely on machine learning. These systems typically operate in non-static environments, which are prone to unexpected changes, as is the case of self-driving cars and enterprise systems. In this context, machine-learned software can misbehave. Thus, it is paramount that these systems are capable of detecting problems with their machine-learned components and of adapting themselves to maintain desired qualities. For instance, a fraud detection system that cannot adapt its machine-learned model to efficiently cope with emerging fraud patterns or changes in the volume of transactions is subject to losses of millions of dollars. In this paper, we take a first step towards the development of a framework aimed at self-adapting systems that rely on machine-learned components. We describe: (i) a set of causes of machine-learned component misbehavior and a set of adaptation tactics inspired by the literature on machine learning, motivating them with the aid of a running example; (ii) the required changes to the MAPE-K loop, a popular control loop for self-adaptive systems; and (iii) the challenges associated with developing this framework. We conclude the paper with a set of research questions to guide future work.},
  KEYWORDS = {Machine Learning, Self-adaptation}
}

@InProceedings{ASDYE2021,
  AUTHOR = {Casimiro, Maria and Garlan, David and C\'{a}mara, Javier and Rodrigues, Luis and Romano, Paolo},
  TITLE = {A Probabilistic Model Checking Approach to Self-Adapting Machine Learning Systems},
  YEAR = {2021},
  MONTH = {6 December},
  BOOKTITLE = {Proceedings of the Third International Workshop on Automated and verifiable Software sYstem DEvelopment (ASYDE)},
  PDF = {http://acme.able.cs.cmu.edu/pubs/uploads/pdf/ASYDE-CR.pdf},
  ABSTRACT = {Machine Learning (ML) is increasingly used in domains such as cyber-physical systems and enterprise systems.
These systems typically operate in non-static environments, prone to unpredictable changes that can adversely impact the accuracy of the ML models, which are usually in the critical path of the systems. Mispredictions of ML components can thus affect other components in the system, and ultimately impact overall system utility in non-trivial ways. From this perspective, self-adaptation techniques appear as a natural solution to reason about how to react to environment changes via adaptation tactics that can potentially improve the quality of ML models (e.g., model retraining), and ultimately maximize system utility. However, adapting ML components is non-trivial, since adaptation tactics have costs and it may not be clear in a given context whether the benefits of ML adaptation outweigh its costs. In this paper, we present a formal probabilistic framework, based on model checking, that incorporates the essential governing factors for reasoning at an architectural level about adapting ML classifiers in a system context. The proposed framework can be used in a self-adaptive system to create adaptation strategies that maximize the rewards of a multidimensional utility space. Using a running example from the enterprise systems domain, we show how the proposed framework can be employed to determine the gains achievable via ML adaptation and to find the boundary that renders adaptation worthwhile.},
  KEYWORDS = {Machine Learning, Model Checking, Self-adaptation}
}

@InProceedings{Casimiro:ACSOS:2022,
  AUTHOR = {Casimiro, Maria and Romano, Paolo and Garlan, David and Rodrigues, Luis},
  TITLE = {Towards a Framework for Adapting Machine Learning Components},
  YEAR = {2022},
  BOOKTITLE = {2022 IEEE International Conference on Autonomic Computing and Self-Organizing Systems (ACSOS)},
  PDF = {http://acme.able.cs.cmu.edu/pubs/uploads/pdf/ACSOS2022.pdf},
  ABSTRACT = {Machine Learning (ML) models are now commonly used as components in systems. Like any other component, ML components can produce erroneous outputs that may penalize system utility. In this context, self-adaptive systems emerge as a natural approach to cope with ML mispredictions, through the execution of adaptation tactics such as model retraining. To synthesize an adaptation strategy, the self-adaptation manager needs to reason about the cost-benefit tradeoffs of the applicable tactics, which is a non-trivial task for tactics such as model retraining, whose benefits are both context- and data-dependent. To address this challenge, this paper proposes a probabilistic modeling framework that supports automated reasoning about the cost/benefit tradeoffs associated with improving the ML components of ML-based systems. The key idea of the proposed approach is to decouple the problems of (i) estimating the expected performance improvement after retraining and (ii) estimating the impact of improved ML predictions on overall system utility. We demonstrate the application of the proposed framework by using it to self-adapt a state-of-the-art ML-based fraud-detection system, which we evaluate using a publicly available, real fraud detection dataset.
We show that, by predicting the system utility that stems from retraining an ML component, the probabilistic model checker can generate adaptation strategies that are significantly closer to optimal than baselines such as periodic or reactive retraining.},
  NOTE = {Presentation Video},
  KEYWORDS = {Machine Learning, Self-adaptation}
}

@InProceedings{AAAI23::HyperJump,
  AUTHOR = {Mendes, Pedro and Casimiro, Maria and Romano, Paolo and Garlan, David},
  TITLE = {HyperJump: Accelerating HyperBand via Risk Modelling},
  YEAR = {2023},
  BOOKTITLE = {Proceedings of the 37th AAAI Conference on Artificial Intelligence},
  PDF = {http://acme.able.cs.cmu.edu/pubs/uploads/pdf/hyperjump_AAAI23_CR.pdf},
  ABSTRACT = {In the literature on hyper-parameter tuning, a number of recent solutions rely on low-fidelity observations (e.g., training with sub-sampled datasets) to efficiently identify promising configurations that are then tested via high-fidelity observations (e.g., using the full dataset). Among these, HyperBand is arguably one of the most popular solutions, due to its efficiency and theoretically provable robustness. In this work, we introduce HyperJump, a new approach that builds on HyperBand's robust search strategy and complements it with novel model-based risk analysis techniques that accelerate the search by skipping the evaluation of low-risk configurations, i.e., configurations that are likely to be eventually discarded by HyperBand. We evaluate HyperJump on a suite of hyper-parameter optimization problems and show that it provides speed-ups of over one order of magnitude, both in sequential and parallel deployments, on a variety of deep learning, kernel-based learning, and neural architecture search problems, when compared to HyperBand and to several state-of-the-art optimizers.},
  KEYWORDS = {Machine Learning, Self-adaptation}
}