@comment{
  Publication list on multicore-aware service execution, hardware-aware
  thread scheduling, and related tooling (entries dated 2010 to 2014).
  Conventions used in this file: one field per line, authors in
  "Last, First" form, bare DOIs without a resolver prefix, page ranges
  written with a double hyphen, months given as the standard three-letter
  macros, and case-sensitive title words protected with braces.
  Citation keys are kept exactly as exported so existing cite commands
  continue to resolve.
}

@article{peternier2012CPE,
  author    = {Peternier, Achille and Binder, Walter and Pautasso, Cesare and Bonetta, Daniele},
  title     = {High Performance Execution of Service Compositions: a Multicore-aware Engine Design},
  journal   = {Concurrency and Computation: Practice and Experience (CPE)},
  volume    = {26},
  year      = {2014},
  month     = jan,
  pages     = {71--97},
  publisher = {Wiley},
  keywords  = {JOpera, multicores, nonuniform memory access architecture, performance optimization, service composition and execution},
  doi       = {10.1002/cpe.2948},
  abstract  = {Although modern computer hardware offers an increasing number of processing elements organized in nonuniform memory access (NUMA) architectures, prevailing middleware engines for executing business processes, workflows, and Web service compositions have not been optimized for properly exploiting the abundant processing resources of such machines. Amongst others, factors limiting performance are inefficient thread scheduling by the operating system, which can result in suboptimal use of system memory and CPU caches, and sequential code sections that cannot take advantage of multiple available cores. In this article, we study the performance of the JOpera process execution engine on recent multicore machines. We first evaluate its performance without any dedicated optimization for multicore hardware, showing that additional cores do not significantly improve performance, although the engine has a multithreaded design. Therefore, we apply optimizations on the basis of replication together with an improved, hardware-aware usage of the underlying resources such as NUMA nodes and CPU caches. Thanks to our optimizations, we achieve speedups from a factor of 2 up to a factor of 20 (depending on the target machine) when compared with a baseline execution {\textquoteleft}as is{\textquoteright}.},
}

@article{105,
  author    = {Peternier, Achille and Ansaloni, Danilo and Bonetta, Daniele and Pautasso, Cesare and Binder, Walter},
  title     = {Improving execution unit occupancy on {SMT}-based processors through hardware-aware thread scheduling},
  journal   = {Future Generation Computer Systems},
  volume    = {30},
  year      = {2014},
  month     = jan,
  pages     = {229--241},
  keywords  = {multicores, Overseer},
  issn      = {0167-739X},
  doi       = {10.1016/j.future.2013.06.015},
  abstract  = {Modern processor architectures are increasingly complex and heterogeneous, often requiring software solutions tailored to the specific hardware characteristics of each processor model. In this article, we address this problem by targeting two processors featuring Simultaneous MultiThreading (SMT) to improve the occupancy of their internal execution units through a sustained stream of instructions coming from more than one thread. We target the AMD Bulldozer and IBM POWER7 processors as case studies for specific hardware-oriented performance optimizations that increase the variety of instructions sent to each core to maximize the occupancy of all its execution units. WorkOver, presented in this article, improves thread scheduling by increasing the performance of floating point-intensive workloads on Linux-based operating systems. WorkOver is a user-space monitoring tool that automatically identifies FPU-intensive threads and schedules them in a more efficient way without requiring any patches or modifications at the kernel level. Our measurements using standard benchmark suites show that speedups of up to 20\% can be achieved by simply allowing WorkOver to monitor applications and schedule their threads, without any modification of the workload.},
}

@inproceedings{peternier2012ICPADS,
  author    = {Peternier, Achille and Ansaloni, Danilo and Bonetta, Daniele and Pautasso, Cesare and Binder, Walter},
  title     = {Hardware-aware thread scheduling: the case of asymmetric multicore processors},
  booktitle = {18th International Conference on Parallel and Distributed Systems (ICPADS)},
  year      = {2012},
  month     = dec,
  pages     = {400--407},
  address   = {Singapore},
  doi       = {10.1109/ICPADS.2012.62},
  abstract  = {Modern processor architectures are increasingly complex and heterogeneous, often requiring solutions tailored to the specific characteristics of each processor model. In this paper we address this problem by targeting the AMD Bulldozer processor as case study for specific hardware-oriented performance optimizations. The Bulldozer architecture features an asymmetric simultaneous multithreading implementation with shared floating point units (FPUs) and per-core arithmetic logic units (ALUs). Bulld Over, presented in this paper, improves thread scheduling by exploiting this hardware characteristic to increase performance of floating point-intensive workloads on Linux-based operating systems. Bulld Over is a user-space monitoring tool that automatically identifies FPU-intensive threads and schedules them in a more efficient way without requiring any patches or modifications at the kernel level. Our measurements using standard benchmark suites show that speedups of up to 10\% can be achieved by simply allowing Bulld Over to monitor applications, without any modification of the workload.},
}

@inproceedings{bonetta2012EuroPar,
  author       = {Bonetta, Daniele and Ansaloni, Danilo and Peternier, Achille and Pautasso, Cesare and Binder, Walter},
  title        = {{Node.Scala}: Implicit Parallel Programming for High-Performance Web Services},
  booktitle    = {International European Conference on Parallel and Distributed Computing (EuroPar 2012)},
  volume       = {7484},
  year         = {2012},
  month        = aug,
  pages        = {626--637},
  publisher    = {Springer},
  organization = {Springer},
  address      = {Rhodes Island, Greece},
  keywords     = {Node.JS, REST, Scala, scalability, Web services},
  isbn         = {978-3-642-32819-0},
  url          = {http://sosoa.inf.unisi.ch/files/sosoa_europar2012.pdf},
}

@inproceedings{s:2012:ppopp,
  author       = {Bonetta, Daniele and Peternier, Achille and Pautasso, Cesare and Binder, Walter},
  title        = {{S}: a scripting language for high-performance {RESTful} web services},
  booktitle    = {17th ACM SIGPLAN symposium on Principles and Practice of Parallel Programming (PPoPP 2012)},
  year         = {2012},
  month        = feb,
  pages        = {97--106},
  publisher    = {ACM},
  organization = {ACM},
  address      = {New Orleans, USA},
  keywords     = {REST, scripting, Web service composition},
  isbn         = {978-1-4503-1160-1},
  doi          = {10.1145/2145816.2145829},
  abstract     = {There is an urgent need for novel programming abstractions to leverage the parallelism in modern multicore machines. We introduce S, a new domain-specific language targeting the server-side scripting of high-performance RESTful Web services. S promotes an innovative programming model based on explicit (control-flow) and implicit (process-level) parallelism control, allowing the service developer to specify which portions of the control-flow should be executed in parallel. For each service, the choice of the best level of parallelism is left to the runtime system. We assess performance and scalability by implementing two non-trivial composite Web services in S. Experiments show that S-based Web services can handle thousands of concurrent client requests on a modern multicore machine.},
}

@inproceedings{overseer:2011:pppj,
  author       = {Peternier, Achille and Bonetta, Daniele and Binder, Walter and Pautasso, Cesare},
  title        = {Overseer: low-level hardware monitoring and management for {Java}},
  booktitle    = {9th International Conference on Principles and Practice of Programming in Java (PPPJ {\textquoteright}11)},
  year         = {2011},
  pages        = {143--146},
  publisher    = {ACM},
  organization = {ACM},
  address      = {New York, NY, USA},
  keywords     = {hardware performance counters, Java, monitoring, Overseer},
  isbn         = {978-1-4503-0935-6},
  doi          = {10.1145/2093157.2093179},
  abstract     = {The high-level and portable nature of the Java platform allows applications to be written once and executed on all the supported systems. However, such a feature comes at the cost of hardware abstraction, making it more difficult or even impossible to access several low-level functionalities. Overseer is a Java framework that makes it possible on Linux systems by simplifying access to real-time measurement of low-level data such as Hardware Performance Counters (HPCs), IPMI sensors, and Java VM internal events. Overseer supports functionalities such as HPC-management, process/thread affinity settings, hardware topology identification, as well as power-consumption and temperature monitoring. In this paper we describe Overseer and how to use it to extend Java applications with functionalities not provided by the default runtime. A public release of Overseer is available.},
}

@comment{
  NOTE(review): the entry sosoa:2011 below has no booktitle, which the
  inproceedings type requires. The proceedings title is not recorded in
  this file -- TODO add it after confirming against the publisher record.
}

@inproceedings{sosoa:2011,
  author    = {Binder, Walter and Bonetta, Daniele and Pautasso, Cesare and Peternier, Achille and Milano, Diego and Schuldt, Heiko and Stojnic, Nenad and Faltings, Boi and Trummer, Immanuel},
  title     = {Towards Self-Organizing Service-Oriented Architectures},
  year      = {2011},
  month     = jul,
  pages     = {115--121},
  publisher = {IEEE},
  address   = {Washington, DC, USA},
  keywords  = {autonomic computing, monitoring, self-organizing service-oriented architecture, service composition, service oriented computing, Web services},
  doi       = {10.1109/SERVICES.2011.44},
  abstract  = {Service-oriented architectures (SOAs) provide a successful model for structuring complex distributed software systems, as they reduce the cost of ownership and ease the creation of new applications by composing existing services. However, currently, the development of service-oriented applications requires many manual tasks and prevailing infrastructure is often based on centralized components that are central points of failure and easily become bottlenecks. In this paper, we promote self-organizing SOA as a new approach to overcome these limitations. Self-organizing SOA integrates research results in the areas of autonomic and service oriented computing. We consider self-organizing features for the whole life-cycle of a service-oriented application, from the creation to the execution, optimization, and monitoring.},
}

@inproceedings{jopera:2010:soca,
  author       = {Peternier, Achille and Bonetta, Daniele and Pautasso, Cesare and Binder, Walter},
  title        = {Exploiting multicores to optimize business process execution},
  booktitle    = {International Conference on Service-Oriented Computing and Applications (SOCA 2010)},
  year         = {2010},
  month        = dec,
  pages        = {1--8},
  publisher    = {IEEE},
  organization = {IEEE},
  address      = {Perth, Australia},
  keywords     = {business data processing, business process execution engines, business process execution optimization, hardware performance counters, Java, JOpera, multicores, performance optimization, thread-CPU bindings, Web service composition, Web services, workflow},
  doi          = {10.1109/SOCA.2010.5707156},
  abstract     = {While modern CPUs offer an increasing number of cores with shared caches, prevailing execution engines for business processes, workflows, or Web service compositions have not been optimized for properly exploiting the abundant processing resources of such CPUs. One factor limiting performance is the inefficient thread scheduling by the operating system, which can result in suboptimal use of shared caches. In this paper we study performance of the JOpera business process execution engine on a recent multicore machine. By analyzing the engine{\textquoteright}s architecture and by binding threads that are likely to access shared data to cores with a common cache, we achieve speedups up to 13\% for a variety of workloads, without modifying the engine{\textquoteright}s architecture and implementation, apart from binding threads to CPUs. As the engine is implemented in Java, we provide a new Java library to manage thread bindings and hardware performance counters. We also leverage hardware performance counters to explain the observed speedup in our performance analysis.},
}

@inproceedings{jopera:2010:apscc,
  author       = {Bonetta, Daniele and Peternier, Achille and Pautasso, Cesare and Binder, Walter},
  title        = {A Multicore-Aware Runtime Architecture for Scalable Service Composition},
  booktitle    = {5th Asia-Pacific Services Computing Conference (APSCC 2010)},
  year         = {2010},
  month        = dec,
  pages        = {83--90},
  publisher    = {IEEE},
  organization = {IEEE},
  address      = {Hangzhou, China},
  keywords     = {middleware, multicore-aware runtime architecture, multicores, process execution engine, Web service composition, Web services},
  doi          = {10.1109/APSCC.2010.61},
  abstract     = {Middleware for web service orchestration, such as runtime engines for executing business processes, workflows, or web service compositions, can easily become performance bottlenecks when the number of concurrent service requests increases. Many existing process execution engines have been designed to address scalability with distribution and replication techniques. However, the advent of modern multicore machines, comprising several chip multi-processors each offering multiple cores and often featuring a large shared cache, offers the opportunity to redesign the architecture of process execution engines in order to take full advantage of the underlying hardware resources. In this paper we present an innovative process execution engine architecture. Its design takes into account the specific constraints of multicore machines and scales well on different processor architectures, as shown by our extensive performance evaluation. A key feature of the design is self-configuration at startup according to the type and number of available CPUs. We show that our design makes efficient use of the available resources and can scale to run thousands of concurrent business process instances per second, highlighting the potential and the benefits for multicore-awareness in the design of scalable process execution engines.},
}

@comment{
  NOTE(review): the entry 1948635 below has no booktitle, which the
  inproceedings type requires. The proceedings title is not recorded in
  this file -- TODO add it after confirming against the publisher record.
}

@inproceedings{1948635,
  author    = {Bonetta, Daniele and Peternier, Achille and Pautasso, Cesare and Binder, Walter},
  title     = {Towards scalable service composition on multicores},
  year      = {2010},
  month     = oct,
  pages     = {655--664},
  publisher = {Springer},
  address   = {Crete},
  keywords  = {JOpera, multicore, Web service composition},
  isbn      = {3-642-16960-0},
  doi       = {10.1007/978-3-642-16961-8_90},
  abstract  = {The advent of modern multicore machines, comprising several chip multi-processors each offering multiple cores and often featuring a large shared cache, offers the opportunity to redesign the architecture of service composition engines in order to take full advantage of the underlying hardware resources. In this paper we introduce an innovative service composition engine architecture, which takes into account specific features of multicore machines while not being constrained to run on any particular processor architecture. Our preliminary performance evaluation results show that the system can scale to run thousands of concurrent business process instances per second.},
}