@Article{budiu-sigmod24, author = {Mihai Budiu and Tej Chajed and Frank McSherry and Leonid Ryzhyk and Val Tannen}, title = {{DBSP}: Incremental Computation on Streams and Its Applications to Databases}, journal = {SIGMOD Research Highlights}, volume = {53}, month = {March}, year = {2024}, note = {A shorter and simpler version of the VLDB 2023 DBSP paper.}, url = {https://mihaibudiu.github.io/work/budiu-sigmod24.pdf}, confweb = {https://sigmodrecord.org/category/research-highlights/}, no = {1}, url2 = {https://sigmodrecord.org/?smd_process_download=1&download_id=13708} } @Misc{goldstein-retrospective23a, author = {Seth Copen Goldstein and Mihai Budiu}, title = {A retrospective on ``{NanoFabrics}: spatial computing using molecular electronics''}, note = {Original paper published in ISCA 2001}, url = {https://sites.coecis.cornell.edu/isca50retrospective/files/2023/06/goldstein_2001_nanofabrics.pdf}, confweb = {https://sites.coecis.cornell.edu/isca50retrospective/papers/}, howpublished = {ISCA@50 25-year Retrospective 1996-2020} } @Misc{goldstein-retrospective23b, author = {Seth Copen Goldstein and Herman Schmit and Matthew Moe and Mihai Budiu and Srihari Cadambi and R. Reed Taylor, and Ronald Laufer}, title = {A retrospective on ``{PipeRench}: A Coprocessor for Streaming Multimedia Acceleration}, note = {Original paper published in ISCA 1999}, url = {https://sites.coecis.cornell.edu/isca50retrospective/files/2023/06/goldstein_1999_piperench.pdf}, confweb = {https://sites.coecis.cornell.edu/isca50retrospective/papers/}, howpublished = {ISCA@50 25-year Retrospective 1996-2020} } @InProceedings{budiu-vldb23, author = {Mihai Budiu and Tej Chajed and Frank McSherry and Leonid Ryzhyk and Val Tannen}, title = {{DBSP}: Automatic Incremental View Maintenance for Rich Query Languages}, booktitle = {Proceedings of the VLDB Endowment (VLDB)}, volume = {16}, number = {7}, pages = {1601-1614}, address = {Vancouver, Canada}, month = {August}, year = {2023}, abstract = {Incremental view maintenance (IVM) has long been a central problem in database theory. Many solutions have been proposed for restricted classes of database languages, such as the relational algebra, or Datalog. These techniques do not naturally generalize to richer languages. In this paper we give a general, heuristic-free solution to this problem in 3 steps: (1) we describe a simple but expressive language called DBSP for describing computations over data streams; (2) we give a new mathematical definition of IVM and a general algorithm for solving IVM for arbitrary DBSP programs, and (3) we show how to model many rich database query languages using DBSP (including the full relational algebra, queries over sets and multisets, arbitrarily nested relations, aggregation, flatmap (unnest), monotonic and non-monotonic recursion, streaming aggregation, and arbitrary compositions of all of these). SQL and Datalog can both be implemented in DBSP. As a consequence, we obtain efficient incremental view maintenance algorithms for queries written in all these languages.}, note = {Best paper award. ACM SIGMOD Research Highlights Award.}, url = {https://mihaibudiu.github.io/work/budiu-vldb23.pdf}, confweb = {https://www.vldb.org/2023}, doi = {https://doi.org/10.14778/3587136.3587137}, video = {https://1drv.ms/v/s!AlywK8G1COQ_lfR6Ry6jtRIMJRhi0g?e=bKP1Kp} } @InProceedings{rong-vldb23, author = {Kexin Rong and Mihai Budiu and Athinagoras Skiadopoulos and Lalith Suresh and Amy Tai}, title = {Scaling a Declarative Cluster Manager Architecture with Query Optimization Techniques}, booktitle = {Proceedings of the VLDB Endowment (VLDB)}, pages = {2618–2631}, address = {Vancouver, Canada}, month = {August}, year = {2023}, abstract = {Cluster managers play a crucial role in data centers by distributing workloads among infrastructure resources. Declarative Cluster Managers (DCM) is a new cluster management architecture that allows users to express placement policies declaratively using SQL-like queries. In this paper, we share our experiences scaling up this architecture from moderate-sized enterprise clusters (hundreds or thousands of nodes) to hyperscale clusters (tens of thousands of nodes) through the lens of query optimization. To do so, we first formally specify DCM's declarative language, C-SQL, which introduces new semantics such as variable columns on top of standard SQL to express constraint optimization problems. Given cluster management logic specified as C-SQL programs, we explore and adapt techniques from classic query optimization, namely incremental view maintenance and predicate pushdown, to improve the execution efficiency of the relation and constraint components of C-SQL programs. We evaluate the effectiveness of our optimizations through a case study of building Kubernetes schedulers using C-SQL. Our optimizations demonstrated an almost 3000× speed up in database latency and reduced the size of optimization problems by as much as 1/300 of the original without affecting feasibility.}, url = {https://mihaibudiu.github.io/work/rong-vldb23.pdf}, confweb = {https://www.vldb.org/2023} } @InProceedings{sur-hotnets22, author = {Debnil Sur and Ben Pfaff and Leonid Ryzhyk and Mihai Budiu}, title = {Full-Stack {SDN}}, booktitle = {ACM Workshop on Hot Topics in Networks (HotNets)}, address = {Austin, TX}, month = {November 14-15}, year = {2022}, abstract = {In the conventional approach to developing a software-defined network system, the management, control, and data planes are all separately developed. Manually written code connects the management plane's configuration to the control plane, and the control plane generates the data planes' configurations as small program fragments that scatter across the codebase. Scalability and correctness become increasing challenges as such a system develops and grows. In response, we propose a vision, called Nerpa, in which all three planes are programmed in a unified way. In Nerpa's design, a transactional database stores the management plane state. The control plane is implemented in a specialized query language that, when compiled, will automatically execute in an incremental fashion, which improves scalability. Finally, the data plane is programmed in P4. To aid correctness, all three parts are type-checked together, and automated tools generate most code for data movement between planes. We have published a prototype implementation using an open-source license. We believe that full-stack SDN can build more robust and maintainable networked systems.}, url = {https://mihaibudiu.github.io/work/sur-hotnets22.pdf}, confweb = {https://conferences.sigcomm.org/hotnets/2022/} } @article{budiu-jpc22, author = {Budiu, Mihai and Thaker, Pratiksha and Gopalan, Parikshit and Wieder, Udi and Zaharia, Matei}, title = {{Overlook}: Differentially Private Exploratory Visualization for Big Data}, journal = {Journal of Privacy and Confidentiality}, volume = {12}, number = {1}, month = {Jul.}, year = {2022}, note = {An expanded version of the conference paper from TPDP 2020.}, url = {https://journalprivacyconfidentiality.org/index.php/jpc/article/view/779}, doi = {https://doi.org/10.29012/jpc.779}, abstractnote = {Data exploration systems that provide differential privacy must manage a privacy budget that measures the amount of privacy lost across multiple queries. One effective strategy to manage the privacy budget is to compute a one-time private synopsis of the data, to which users can make an unlimited number of queries. However, existing systems using synopses are built for offline use cases, where a set of queries is known ahead of time and the system carefully optimizes a synopsis for it. The synopses that these systems build are costly to compute and may also be costly to store. We introduce Overlook, a system that enables private data exploration at interactive latencies for both data analysts and data curators. The key idea in Overlook is ``virtual synopsis'' that can be evaluated \emph{incrementally}, without extra space storage or expensive precomputation. Overlook simply executes queries using an existing engine, such as a SQL DBMS, and adds noise to their results. Because Overlook's synopses do not require costly precomputation or storage, data curators can also use Overlook to explore the impact of privacy parameters interactively. Overlook offers a rich visual query interface based on the open source Hillview system. Overlook achieves accuracy comparable to existing synopsis-based systems, while offering better performance and removing the need for extra storage.} } @Manual{P4-16-22, author = {The P4.org consortium}, title = {The P4-16 Language Specification, version 1.2.3}, month = {July}, year = {2022}, abstract = {P4 is a language for programming the data plane of network devices. This document describes the 1.2.3 version of the programming language.}, url = {https://p4.org/p4-spec/docs/P4-16-v-1.2.3.html}, confweb = {https://p4.org} } @InProceedings{sur-p422, author = {Debnil Sur and Ben Pfaff and Leonid Ryzhyk and and Mihai Budiu}, title = {{Nerpa}: Network Programming with Relational and Procedural Abstractions}, booktitle = {P4 Workshop}, address = {virtual event}, month = {May 24--26}, year = {2022}, abstract = {We introduce Nerpa, a new methodology for building programmable networks. Nerpa automates many aspects of the process of programming the network stack. To aid correctness, it ensures type safety across the management, control, and data planes. To improve scalability, an incremental control plane recomputes state in response to network configuration changes. We have published an implementation and examples.}, url = {https://mihaibudiu.github.io/work/nerpa-p422.pdf}, confweb = {https://opennetworking.org/2022-p4-workshop/} } @InProceedings{pfaff-p422, author = {Ben Pfaff and Debnil Sur and Leonid Ryzhyk and Mihai Budiu}, title = {{P4} in {Open vSwitch} with {OFP4}}, booktitle = {P4 Workshop}, address = {virtual event}, month = {May 24--26}, year = {2022}, abstract = {Software implementations of P4 available today have significant limitations. Given that, we introduce OFP4, a prototype of an implementation of P4, including P4Runtime support, that uses Open vSwitch as its back-end. OFP4 translates P4 code plus runtime entities such as table entries into OpenFlow (OF) flows, which it installs in a running Open vSwitch instance using the OpenFlow protocol. This paper describes how this translation works and provides an overview of our proof-of-concept implementation.}, url = {https://mihaibudiu.github.io/work/ofp4-p422.pdf}, confweb = {https://opennetworking.org/2022-p4-workshop/} } @misc{budiu-arxiv22, author = {Budiu, Mihai and McSherry, Frank and Ryzhyk, Leonid and Tannen, Val}, title = {{DBSP}: Automatic Incremental View Maintenance for Rich Query Languages}, publisher = {arXiv}, month = {March}, year = {2022}, abstract = {Incremental view maintenance has been for a long time a central problem in database theory. Many solutions have been proposed for restricted classes of database languages, such as the relational algebra, or Datalog. These techniques do not naturally generalize to richer languages. In this paper we give a general solution to this problem in 3 steps: (1) we describe a simple but expressive language called DBSP for describing computations over data streams; (2) we give a general algorithm for solving the incremental view maintenance problem for arbitrary DBSP programs, and (3) we show how to model many rich database query languages (including the full relational queries, grouping and aggregation, monotonic and non-monotonic recursion, and streaming aggregation) using DBSP. As a consequence, we obtain efficient incremental view maintenance techniques for all these rich languages.}, url = {https://arxiv.org/abs/2203.16684} } @Manual{P4-16-21, author = {The P4.org consortium}, title = {The P4-16 Language Specification, version 1.2.2}, month = may, year = {2021}, abstract = {P4 is a language for programming the data plane of network devices. This document describes the 1.2.2 version of the programming language.}, url = {https://p4.org/p4-spec/docs/P4-16-v-1.2.2.html}, confweb = {https://p4.org} } @InProceedings{thaker-tpdp20, author = {Pratiksha Thaker and Mihai Budiu and Parikshit Gopalan and Udi Wieder and Matei Zaharia}, title = {{Overlook}: Differentially Private Exploratory Visualization for Big Data}, booktitle = {Theory and Practice of Differential Privacy (TPDP 2020)}, address = {Orlando, FL}, month = {November 13}, year = {2020}, abstract = {Data exploration systems that provide differential privacy must manage a privacy budget that measures the amount of privacy lost across multiple queries. One effective strategy to manage the privacy budget is to compute a one-time private synopsis of the data, to which users can make an unlimited number of queries. However, existing systems using synopses are built for offline use cases, where a set of queries is known ahead of time and the system carefully optimizes a synopsis for it. The synopses that these systems build are costly to compute and may also be costly to store. We introduce Overlook, a system that enables private data exploration at interactive latencies for both data analysts and data curators. Overlook enables fast computation of private synopses using the idea of a "virtual synopsis," which represents a potentially large private synopsis implicitly using a pseudorandom function that is evaluated on demand. Overlook uses a synopsis that is simple but fast and has accuracy comparable to other state-of-the-art synopses, requiring 0.5 seconds or less to generate a histogram over 60 GB of data -- only 2.5 × slower than the equivalent public histogram. Together, these allow Overlook to provide a rich, interactive, and fast visual query interface that allows users to explore private data with minimal storage overhead -- tens of bytes -- on the server}, url = {https://mihaibudiu.github.io/work/TPDP2020.pdf}, comment = {A longer version at \url{https://arxiv.org/abs/2006.12018}, subseqently published (with significant changes) in JPC 2022.}, confweb = {https://tpdp.journalprivacyconfidentiality.org/2020/} } @Manual{P4-16-20, author = {The P4.org consortium}, title = {The P4-16 Language Specification, version 1.2.1}, month = {June}, year = {2020}, abstract = {P4 is a language for programming the data plane of network devices. This document describes the 1.2.1 version of the programming language.}, url = {https://p4lang.github.io/p4-spec/docs/P4-16-v1.2.1.html}, confweb = {https://p4.org} } @InProceedings{budiu-vldb19, author = {Mihai Budiu and Parikshit Gopalan and Lalith Suresh and Udi Wieder and Han Kruiger and Marcos K. Aguilera}, title = {{Hillview}: A trillion-cell spreadsheet for big data}, booktitle = {Proceedings of the VLDB Endowment (VLDB)}, volume = {12}, number = {11}, pages = {1442--1457}, address = {Los Angeles, CA}, month = {August}, year = {2019}, abstract = {Hillview is a distributed spreadsheet for browsing very large datasets that cannot be handled by a single machine. As a spreadsheet, Hillview provides a high degree of interactivity that permits data analysts to explore information quickly along many dimensions while switching visualizations on a whim. To provide the required responsiveness, Hillview introduces visualization sketches, or vizketches, as a simple idea to produce compact data visualizations. Vizketches combine algorithmic techniques for data summarization with computer graphics principles for efficient rendering. While simple, vizketches are effective at scaling the spreadsheet by parallelizing computation, reducing communication, providing progressive visualizations, and offering precise accuracy guarantees. Using Hillview running on eight servers, we can navigate and visualize datasets of tens of billions of rows and trillions of cells, much beyond the published capabilities of competing systems.}, note = {A longer version available as \url{https://arxiv.org/abs/1907.04827}}, url = {https://mihaibudiu.github.io/work/budiu-vldb19.pdf}, confweb = {https://vldb.org/2019/}, doi = {https://doi.org/10.14778/3342263.3342279} } @Manual{P4-16-19, author = {The P4.org consortium}, title = {The P4-16 Language Specification, version 1.2.0}, month = {October}, year = {2019}, abstract = {P4 is a language for programming the data plane of network devices. This document describes the 1.2.0 version of the programming language.}, url = {https://p4.org/p4-spec/docs/P4-16-v1.2.0.html}, confweb = {https://p4.org} } @InProceedings{ryzhyk-datalog19, author = {Leonid Ryzhyk and Mihai Budiu}, title = {Differential Datalog}, booktitle = {Datalog 2.0}, address = {Philadelphia, PA}, month = {June 4-5}, year = {2019}, abstract = {Many real-world applications based on deductive databases require incrementally updating output relations (tables) in response to changes to input relations. To make such applications easier to implement we have created Differential Datalog (DDlog), a dialect of Datalog that automates incremental computation. A DDlog programmer writes traditional, non-incremental Datalog programs. The execution model of DDlog is however fully incremental: at runtime DDlog programs receive streams of changes to the input relations (insertions or deletions) and produce streams of corresponding changes to derived relations. The DDlog compiler translates DDlog programs to Differential Dataflow (DD) programs; DD provides an incremental execution engine supporting all the relational operators, including fixed-point. The DDlog runtime automatically maintains the indexes required to efficiently compute output updates. \par The DDlog language is targeted for system builders. In consequence, the language emphasizes usability, by providing a rich type system, a powerful expression language, a module system, including string manipulation, arithmetic, and integration with C, Rust, and Java. The code is open-source, available using an MIT permissive license.}, url = {https://mihaibudiu.github.io/work/ddlog.pdf}, confweb = {https://sites.sju.edu/plw/datalog2/} } @Manual{P4-16-18, author = {The P4.org consortium}, title = {The P4-16 Language Specification, version 1.1.0}, month = {November}, year = {2018}, abstract = {P4 is a language for programming the data plane of network devices. This document describes the 1.1.0 version of the programming language.}, url = {https://p4.org/p4-spec/docs/P4-16-v1.1.0-spec.html}, confweb = {https://p4.org} } @InProceedings{tu-lpc18, author = {William Tu and Fabian Ruffy and Mihai Budiu}, title = {{P4C-XDP}: Programming the Linux Kernel Forwarding Plane Using {P4}}, booktitle = {Linux Plumber's Conference}, address = {Vancouver, Canada}, month = {November 13-15}, year = {2018}, abstract = {P4 is a domain-specific language for implementing network data-planes. The P4 abstraction allows programmers to write network protocols in a generalized fashion, without needing to know the configuration specifics of the targeted data-plane. The extended Berkely Packet Filter (eBPF) is a safe virtual machine for executing sand-boxed programs in the Linux kernel. eBPF, and its extension the eXpress Data Path (XDP), effectively serve as programmable data-planes of the kernel. P4C-XDP is a project combining the performance of XDP with the generality and usability of P4. In this document, we describe how P4 can be translated into eBPF/XDP. We review the fundamental limitations of both technologies, analyze the performance of several generated XDP programs, and discuss problems we have faced while working on this new technology.}, url = {https://mihaibudiu.github.io/work/p4c-xdp-lpc18-paper.pdf}, confweb = {https://www.linuxplumbersconf.org/event/2/} } @Article{p4-osr17, author = {Mihai Budiu and Chris Dodd}, title = {The {P4-16} Programming Language}, journal = {ACM SIGOPS Operating Systems Review}, volume = {51}, number = {1}, pages = {5--14}, month = {August}, year = {2017}, abstract = {P4 is a language for expressing how packets are processed by the data-plane of a programmable network element such as a hardware or software switch, network interface card, router or network function appliance. This document describes the most recent version of the language, P4-16, and the reference implementation of the P4-16 compiler.}, url = {https://mihaibudiu.github.io/work/p4-osr17.pdf}, confweb = {https://dl.acm.org/citation.cfm?id=3139645&picked=prox}, doi = {https://dl.acm.org/citation.cfm?doid=3139645.3139648} } @Manual{P4-16, author = {The P4.org consortium}, title = {The P4-16 Language Specification}, month = {December 16}, year = {2016}, abstract = {P4 is a language for programming the data plane of network devices. This document provides a precise definition of the P4-16 language, which is the 2016 revision of the P4 language (https://p4.org). The primary target audience for this document includes developers who want to write compilers/simulators/IDEs/debuggers for P4 programs. This document may also be useful for P4 programmers who are interested in understanding the language syntax and semantics at a deeper level.}, confweb = {https://p4.org} } @InProceedings{budiu-egpgv16, author = {Mihai Budiu and Rebecca Isaacs and Derek Murray and Gordon Plotkin and Paul Barham and Samer Al-Kiswany and Yazan Boshmaf and Qingzhou Luo and Alexandr Andoni}, title = {Interacting with Large Distributed Datasets Using Sketch}, booktitle = {Eurographics Symposium on Parallel Graphics and Visualization}, pages = {13}, address = {Groningen, Netherlands}, month = {June 6-7}, year = {2016}, abstract = {We present Sketch, a library and a distributed runtime for building interactive tools for exploring large datasets, distributed across multiple machines. We have built several sophisticated applications using this framework;in this paper we describe a billion-row spreadsheet, and a distributed-systems performance analyzer. Sketch applications allow interactive and responsive exploration of complex distributed datasets, scaling effectively to take advantage of large computational resources.}, note = {Also as University of Wisconsin-Madison Technical report TR1817}, url = {https://mihaibudiu.github.io/work/sketch.pdf}, video = {https://youtu.be/NkV5r7OzCoc} } @InProceedings{sivaraman-sigcomm16, author = {Anirudh Sivaraman and Alvin Cheung and Mihai Budiu and Changhoon Kim and Mohammad Alizadeh and Hari Balakrishnan and George Varghese and Nick McKeown and Steve Licking}, title = {Packet Transactions: High-level Programming for Line-Rate Switches}, booktitle = {ACM SIGCOMM}, address = {Florian\'opolis, Brazil}, month = {August 22-26}, year = {2016}, abstract = {Many algorithms for congestion control, scheduling, network measurement, active queue management, and traffic engineering require custom processing of packets in the data plane of a network switch. To run at line rate, these data-plane algorithms must be implemented in hardware. With today's switch hardware, algorithms cannot be changed, nor new algorithms installed, after a switch has been built. This paper shows how to program data-plane algorithms in a high-level language and compile those programs into low-level microcode that can run on emerging programmable line-rate switching chips. The key challenge is that many data-plane algorithms create and modify algorithmic state. To achieve line-rate programmability for stateful algorithms, we introduce the notion of a packet transaction: a sequential packet-processing code block that is atomic and isolated from other such code blocks. We have developed this idea in Domino, a C-like imperative language to express data-plane algorithms. We show with many examples that Domino provides a convenient way to express sophisticated data-plane algorithms, and show that these algorithms can be run at line rate with modest estimated chip-area overhead.}, url = {https://mihaibudiu.github.io/work/sivaraman-sigcomm16.pdf}, confweb = {https://conferences.sigcomm.org/sigcomm/2016/index.php} } @Misc{budiu-p4-ebpf15, author = {Mihai Budiu}, title = {Compiling {P4} to {eBPF}}, month = {September}, year = {2015} } @InProceedings{sivaraman-sosr15, author = {Anirudh Sivaraman and Changhoon Kim and Ramkumar Krishnamoorthy and Advait Dixit and Mihai Budiu}, title = {{DC}.p4: Programming the Forwarding Plane of a Data-Center Switch}, booktitle = {ACM SIGCOMM Symposium on SDN Research (SOSR)}, address = {Santa Clara, CA}, month = {June 17-18}, year = {2015}, abstract = {The P4 programming language [29, 16] has been recently proposed as a high-level language to program the forwarding plane of programmable packet processors, spanning the spectrum from software switches through FPGAs, NPUs and reconfigurable hardware switches. This paper presents a case study of using P4 to express the forwarding plane behavior of a datacenter switch, comparable in functionality to single-chip shared-memory switches found in many datacenters today. This case study allows us to understand how specific P4 constructs were useful in modeling specific datacenter switch features. We also outline additional language constructs that needed to be added to P4 to support certain features of a datacenter switch. We discuss several lessons that we learned in the process and distill these into a proposal for how P4 could evolve in the future}, url = {https://mihaibudiu.github.io/work/sivaraman-sosr15.pdf}, confweb = {https://opennetsummit.org/2015-archive/sosr/}, doi = {https://dx.doi.org/10.1145/2619239.2626324} } @TechReport{sketch-tr15, author = {Mihai Budiu and Rebecca Isaacs and Derek Murray and Gordon Plotkin and Paul Barham and Samer Al-Kiswany and Yazan Boshmaf and Qingzhou Luo and Alexandr Andoni}, title = {Interacting with Large Distributed Datasets Using {Sketch}}, institution = {University of Wisconsin-Madison}, number = {TR1817}, month = {January}, year = {2015}, abstract = {We present Sketch, a distributed software infrastructure for building interactive tools for exploring large datasets, distributed across multiple machines. We have built three sophisticated applications using this framework: a billion-row spreadsheet, a distributed log browser, and a distributed- systems performance debugging tool. Sketch applications allow interactive and responsive exploration of complex distributed datasets, scaling gracefully to large system sizes. The conflicting constraints of large-scale data and small timescales required by human interaction are difficult to satisfy simultaneously. Sketch exploits a sweet spot in this trade-off by exploiting the observation that the precision of a data view is limited by the resolution of the user?s screen. The system pushes data reduction operations to the data sources. The core Sketch abstraction provides a narrow programming interface; Sketch clients construct a distributed application by stacking modular components with identical interfaces, each providing a useful feature: network transparency, concurrency, fault-tolerance, straggler avoidance, round-trip reduction, distributed aggregation.}, url = {https://digital.library.wisc.edu/1793/70467}, video = {https://mihaibudiu.github.io/work/video-timeline.html} } @TechReport{budiu-tr14, author = {Mihai Budiu and Gordon Plotkin and Yuan Yu and Li Zhang}, title = {Unified Query Processing for {JSON} Documents and Indexes}, institution = {Microsoft Research}, number = {MSR-TR-2014-129}, month = {December}, year = {2014}, abstract = {We present JPath, a JSON database query language, and its syntax, semantics, and implementation. We introduce an indexing data structure for answering JPath queries, and provide a theory unifying query execution on data and index trees using operations on matrices with lattice-valued elements.}, url = {https://mihaibudiu.github.io/work/jpath.pdf} } @Misc{budiu-festschrift14, author = {Mihai Budiu and Gordon Plotkin}, title = {Multilinear Programming with Big Data}, month = {September}, year = {2014}, abstract = {Systems such as MapReduce have become enormously popular for processing massive data sets since they substantially simplify the task of writing many naturally parallelizable parallel programs. In this paper we identify the computations carried out by such programs as linear transformations on distributed collections. To this end we model collections as multisets with a union operation, giving rise to a commutative monoid structure. The results of the computations (e.g., ltering, reduction) also lie in such monoids, (e.g., multisets with union, or the natural numbers with addition). The computations are then modelled as linear (i.e., homomorphic) transformations between the commutative monoids. Binary computations such as join are modelled in this framework by multilinear transformations, i.e., functions of several variables, linear in each argument. We present a typed higher-order language for writing multilinear transformations; the intention is that all computations written in such a programming language are naturally parallelizable. The language provides a rich assortment of collection types, including collections whose elements are negatively or fractionally present (in general it permits modules over any given semiring). The type system segregates computations into linear and nonlinear phases, thereby enabling them to ``switch'' between different commutative monoids over the same underlying set (for example between addition and multiplication on real numbers). We use our language to derive linear versions of standard computations on collections; we also give several examples, including a linear version of MapReduce.}, url = {https://mihaibudiu.github.io/work/budiu-festschrift14.pdf}, howpublished = {Festschrift for {Luca Cardelli}}, confweb = {https://research.microsoft.com/en-us/events/lucacardellifest/} } @InProceedings{budiu-esop13, author = {Mihai Budiu and Joel Galenson and Gordon Plotkin}, title = {The Compiler Forest}, booktitle = {European Symposium on Programming (ESOP)}, volume = {LNCS 7792}, pages = {20}, address = {Rome, Italy}, month = {March 16-24}, year = {2013}, abstract = {We address the problem of writing compilers targeting \emph{complex execution environments}, such as computer clusters composed of machines with multi-core CPUs. To that end we introduce \emph{partial compilers}. These compilers can pass sub-programs to several child (partial) compilers, combining the code generated by their children to generate the final target code. \par We define a set of high-level polymorphic operations manipulating both compilers and partial compilers as first-class values. These mechanisms provide a software architecture for modular compiler construction. This allows the building of a forest of compilers, providing a structured treatment of multistage compilers.}, editors = {Matthias Felleisen and Philippa Gardner} } @article{erlingsson-tocs12, author = {\'{U}lfar Erlingsson and Marcus Peinado and Simon Peter and Mihai Budiu and Gloria Mainar-Ruiz}, title = {{Fay}: Extensible Distributed Tracing from Kernels to Clusters}, journal = {Transactions on Computer Systems (TOCS)}, volume = {30}, number = {4}, month = {November}, year = {2012}, abstract = {Fay is a flexible platform for the efficient collection, processing, and analysis of software execution traces. Fay provides dynamic tracing through use of runtime instrumentation and distributed aggregation within machines and across clusters. At the lowest level, Fay can be safely extended with new tracing primitives, including even untrusted, fully-optimized machine code, and Fay can be applied to running user-mode or kernel-mode software without compromising system stability. At the highest level, Fay provides a unified, declarative means of specifying what events to trace, as well as the aggregation, processing, and analysis of those events. \par We have implemented the Fay tracing platform for Windows and integrated it with two powerful, expressive systems for distributed programming. Our implementation is easy to use, can be applied to unmodified production systems, and provides primitives that allow the overhead of tracing to be greatly reduced, compared to previous dynamic tracing platforms. To show the generality of Fay tracing, we reimplement, in experiments, a range of tracing strategies and several custom mechanisms from existing tracing frameworks. \par Fay shows that modern techniques for high-level querying and data-parallel processing of disagreggated data streams are well suited to comprehensive monitoring of software execution in distributed systems. Revisiting a lesson from the late 1960s [Deutsch and Grant 1971], Fay also demonstrates the efficiency and extensibility benefits of using safe, staticallyverified machine code as the basis for low-level execution tracing. Finally, Fay establishes that, by automatically deriving optimized query plans and code for safe extensions, the expressiveness and performance of high-level tracing queries can equal or even surpass that of specialized monitoring tools.}, note = {An expanded version of the SOSP 2011 paper}, url = {https://mihaibudiu.github.io/work/fay-tocs12.pdf}, confweb = {https://tocs.acm.org}, doi = {https://dx.doi.org/10.1145/2382553.2382555} } @Misc{budiu-hpdc12, author = {Mihai Budiu}, title = {Putting A ``Big-Data'' Platform to Good Use: Training Kinect}, address = {Delft, Netherlands}, month = {June 20}, year = {2012}, note = {Keynote to the 21st International Symposium on High-Performance Parallel and Distributed Computing (HPDC)}, howpublished = {https://mihaibudiu.github.io/work/hpdc12.pdf}, confweb = {https://www.hpdc.org/2012} } @InProceedings{budiu-biglearn11, author = {Mihai Budiu and Jamie Shotton and Derek G. Murray and Mark Finocchio}, title = {Parallelizing the Training of the Kinect Body Parts Labeling Algorithm}, booktitle = {Big Learning: Algorithms, Systems and Tools for Learning at Scale}, address = {Sierra Nevada, Spain}, month = {December 16-17}, year = {2011}, abstract = {We present the parallelized implementation of decision forest training as used in Kinect to train the body parts classification system. We describe the practical details of dealing with large training sets and deep trees, and describe how to parallelize over multiple dimensions of the problem.}, url = {https://mihaibudiu.github.io/work/budiu-biglearn11.pdf}, confweb = {https://biglearn.org/} } @InProceedings{erlingsson-sosp11, author = {\'{U}lfar Erlingsson and Marcus Peinado and Simon Peter and Mihai Budiu}, title = {{Fay}: Extensible Distributed Tracing from Kernels to Clusters}, booktitle = {ACM Symposium on Operating Systems Principles (SOSP)}, address = {Cascais, Portugal}, month = {October 23-26}, year = {2011}, abstract = {Fay is a flexible platform for the efficient collection, processing, and analysis of software execution traces. Fay provides dynamic tracing through use of runtime instrumentation and distributed aggregation within machines and across clusters. At the lowest level, Fay can be safely extended with new tracing primitives, including even untrusted, fully-optimized machine code, and Fay can be applied to running user-mode or kernel-mode software without compromising system stability. At the highest level, Fay provides a unified, declarative means of specifying what events to trace, as well as the aggregation, processing, and analysis of those events. \par We have implemented the Fay tracing platform for Windows and integrated it with two powerful, expressive systems for distributed programming. Our implementation is easy to use, can be applied to unmodified production systems, and provides primitives that allow the overhead of tracing to be greatly reduced, compared to previous dynamic tracing platforms. To show the generality of Fay tracing, we reimplement, in experiments, a range of tracing strategies and several custom mechanisms from existing tracing frameworks. \par Fay shows that modern techniques for high-level querying and data-parallel processing of disagreggated data streams are well suited to comprehensive monitoring of software execution in distributed systems. Revisiting a lesson from the late 1960s [15], Fay also demonstrates the efficiency and extensibility benefits of using safe, statically-verified machine code as the basis for low-level execution tracing. Finally, Fay establishes that, by automatically deriving optimized query plans and code for safe extensions, the expressiveness and performance of high-level tracing queries can equal or even surpass that of specialized monitoring tools.}, url = {https://mihaibudiu.github.io/work/fay-sosp11.pdf} } @InProceedings{gonina-mapreduce11, author = {Ekaterina Gonina and Anitha Kannan and John Shafer and Mihai Budiu}, title = {Parallelizing large-scale data processing applications with data skew: a case study in product-offer matching}, booktitle = {International Workshop on MapReduce and its Applications (MAPREDUCE)}, address = {San Jose, CA}, month = {June 8}, year = {2011}, abstract = {The last decade has seen a surge of interest in large-scale data-parallel processing engines. While these engines share many features in common with parallel databases, they make a set of dierent trade-offs. In consequence many of the lessons learned for programming parallel databases have to be re-learned in the new environment. In this paper we show a case study of parallelizing an example large-scale application (offer matching, a core part of online shopping) on an example MapReduce-based distributed computation engine (DryadLINQ). We focus on the challenges raised by the nature of large data sets and data skew and show how they can be addressed eectively within this computation framework by optimizing the computation to adapt to the nature of the data. In particular we describe three different strategies for performing distributed joins and show how the platform language allows us to implement optimization strategies at the application level, without system support. We show that this extensibility in the programming model allows for a highly highly effective system, providing a measured speedup of more than 100 on 64 machines (256 cores), and an estimated speedup of 200 on 1280 machines (5120 cores) when matching 4 million offers.}, url = {https://mihaibudiu.github.io/work/gonina-mr11.pdf} } @InProceedings{jagannath-hips11, author = {Vilas Jagannath and Zuoning Yin and Mihai Budiu}, title = {Monitoring and Debugging DryadLINQ Applications with Daphne}, booktitle = {International Workshop on High-Level Parallel Programming Models and Supportive Environments (HIPS)}, address = {Anchorage, AK}, month = {May 20}, year = {2011}, abstract = {Debugging and optimizing large-scale applications is still more art than engineering discipline. This document describes our experience in building a set of tools to help DryadLINQ application developers understand and debug their programs. \par The core infrastructure for our tools is a portable library which provides a DryadLINQ job object model (i.e., a local representation of the distributed state of an executed application). Layered on the job object model we have built a variety of interactive and batch tools for: performance data collection and analysis, distributed state visualization, failure diagnostics, debugging, and profiling.}, url = {https://mihaibudiu.github.io/work/jagannath-hips11.pdf}, confweb = {https://www.unixer.de/hips2011}, video = {https://1drv.ms/i/s!AlywK8G1COQ_gZdDPb3ub6hpPDYV8g} } @InBook{mcsherry-chapter11, author = {Frank McSherry and Yuan Yu and Mihai Budiu and Michael Isard and Dennis Fetterly}, editor = {Ron Bekkerman, Misha Bilenko and John Langford}, title = {Scaling Up Machine Learning}, chapter = {Large-Scale Machine Learning using {DryadLINQ}}, publisher = {Cambridge University Press}, year = {2011}, abstract = {This chapter describes DryadLINQ, a general-purpose system for large scale data-parallel computing, and illustrates its use on a number of machine learning problems.}, url = {https://mihaibudiu.github.io/work/dryad-ml-book-chapter.pdf}, confweb = {https://www.cambridge.org/us/knowledge/isbn/item6542017/?site_locale=en_US} } @InProceedings{budiu-ipdps11, author = {Mihai Budiu and Daniel Delling and Renato Werneck}, title = {{DryadOpt}: Branch-and-Bound on Distributed Data-Parallel Execution Engines}, booktitle = {IEEE International Parallel and Distributed Processing Symposium (IPDPS)}, address = {Anchorage, AK}, month = {May 16-20}, year = {2011}, abstract = {We introduce DryadOpt, a library that enables massively parallel and distributed execution of optimization algorithms for solving hard problems. DryadOpt performs an exhaustive search of the solution space using branch-and-bound, by recursively splitting the original problem into many simpler subproblems. DryadOpt uses both parallelism (at the core level) and distributed execution (at the machine level). \par DryadOpt provides to the users a simple yet powerful interface: the user of the library only needs to implement sequential code to process individual subproblems (either by solving them in full or generating new subproblems). The parallelism and distribution are handled automatically by the DryadOpt library, and are invisible to the user. Our system is is implemented on top of DryadLINQ --- a distributed data-parallel execution engine similar to Hadoop and Map-Reduce. Despite the fact that these engines offer a constrained application model, with restricted communication patterns, our experiments show that careful design choices allow DryadOpt to scale linearly with the number of machines, with very little overhead.}, url = {https://mihaibudiu.github.io/work/ipdps11.pdf}, confweb = {https://www.ipdps.org/ipdps2011/2011_cfp.html} } @TechReport{budiu-tr10, author = {Mihai Budiu}, title = {User interfaces for exploring multi-dimensional data sets}, institution = {Microsoft Research}, number = {MSR-TR-2010-67}, month = {June}, year = {2010}, abstract = {Visualization is a crucial part of data manipulation. However, exploring interactively complex visualizations of large amounts of information is not easy. To navigate through highly dimensional data the user needs easily used tools to discover patterns and correlations. We are proposing several simple and intuitive user-interface elements based on drag-and-drop to help the user discover correlations in multi-dimensional data sets. The UI allows the user to (a) assign colors to entities according to values of their attributes, (b) drag-and-drop colors between different views displaying entity attributes and (c) drag-and-drop data selections between views. We exploit functional relationships in the underlying data model to transport colors between views in a sound way.}, url = {https://research.microsoft.com/apps/pubs/?id=132078} } @Article{abadi-tissec09, author = {Mart{\'\i}n Abadi and Mihai Budiu and {\'U}lfar Erlingsson and Jay Ligatti}, title = {Control-Flow Integrity principles, implementations and applications}, journal = {ACM Transactions on Information and System Security (TISSEC)}, volume = {13}, number = {1}, pages = {1--40}, year = {2009}, abstract = {Current software attacks often build on exploits that subvert machine-code execution. The enforcement of a basic safety property, control-flow integrity (CFI), can prevent such attacks from arbitrarily controlling program behavior. CFI enforcement is simple and its guarantees can be established formally, even with respect to powerful adversaries. Moreover, CFI enforcement is practical: It is compatible with existing software and can be done efficiently using software rewriting in commodity systems. Finally, CFI provides a useful foundation for enforcing further security policies, as we demonstrate with efficient software implementations of a protected shadow call stack and of access control for memory regions.}, url = {https://portal.acm.org/citation.cfm?id=1609960&jmp=abstract&coll=portal&dl=ACM&CFID=1071288&CFTOKEN=87398879#}, confweb = {https://www.acm.org/tissec/}, doi = {https://doi.acm.org/10.1145/1609956.1609960} } @InProceedings{goldszmidt-ladis09, author = {Moises Goldszmidt and Mihai Budiu and Yue zhang and Michael Pechuk}, title = {Towards Automatic Policy Refinement in Repair Services for Large Distributed Systems}, booktitle = {Large Scale Distributed Systems and Middleware (LADIS)}, pages = {5}, address = {Big Sky Resort, Big Sky, Montana}, month = {October 10-11}, year = {2009}, abstract = {In order to be economically feasible and to offer high levels of availability and performance, large scale distributed systems depend on the automation of repair services. While there has been considerable work on mechanisms for such automateds ervices, a framework for evaluating and optimizing the policies governing such mechanisms has been lacking. In this paper we propose one such framework and report on our initial experience in applying the framework to analyze and optimize the operation of a geo-distributed cloud storage system at Microsoft.}, note = {Also published in ACM SIGOPS Operating Systems Review vol 44 no 2, 2010, pp 47-51.}, url = {https://mihaibudiu.github.io/work/goldszmidt-ladis09.pdf}, confweb = {https://www.cs.cornell.edu/projects/ladis2009/program.htm} } @InProceedings{kannan-socc09, author = {Hari Kannan and Mihai Budiu and John D. Davis and Girish Venkataramani}, title = {Tuning {SoCs} using the Dynamic Critical Path}, booktitle = {IEEE International SOC Conference}, address = {Belfast, Northern Ireland}, month = {September 9-11}, year = {2009}, abstract = {We propose using a profiling-based technique (Dynamic Critical Path) to guide SoC optimization. Optimizing SoCs composed of many modules involves exploring a large space of possible configurations (exponential in the number of component modules). We present this optimization technique applied to a Globally Asynchronous Locally Synchronous (GALS) RTL design. Furthermore, we investigate the loss of precision when abstract versions of hardware modules are used for the critical path computation. Using the critical path provides very fast convergence towards optimal or near-optimal solutions when analyzing large configuration spaces by optimizing the design for composite optimization metrics, such as energy-delay.}, note = {Also as Microsoft Research Technical Report MSR-TR-2009-44}, url = {https://mihaibudiu.github.io/work/socc09.pdf} } @InProceedings{popa-hotcloud09, author = {Lucian Popa and Mihai Budiu and Yuan Yu and Michael Isard}, title = {{DryadInc}: Reusing work in large-scale computations}, booktitle = {Workshop on Hot Topics in Cloud Computing (HotCloud)}, address = {San Diego, CA}, month = {June 15}, year = {2009}, abstract = {Many large-scale (cloud) computations operate on append-only, partitioned datasets. We present two incremental computation frameworks to reuse prior work in these circumstances: (1) reusing identical computations already performed on data partitions, and (2) computing just on the newly appended data and merging the new and previous results.}, url = {https://mihaibudiu.github.io/work/hotcloud09.pdf}, confweb = {https://www.usenix.org/events/hotcloud09/index.html} } @TechReport{kannan-tr09, author = {Hari Kannan and Mihai Budiu and John D. Davis and Girish Venkataramani}, title = {Tuning {SoCs} using the Dynamic Critical Path}, institution = {Microsoft Research}, number = {MSR-TR-2009-44}, month = {April}, year = {2009}, abstract = {We propose using the Global Dynamic Critical Path to diagnose system-wide bottlenecks using representative benchmarks to direct embedded SoC optimizations and provide real-world experience of implementing the global critical path (GCP) analysis framework on a Globally-Asynchronous Locally-Synchronous (GALS) SoC built around the LEON3 CPU. We perform our analysis at the RTL level and extend our evaluation to abstract RTL models. We use the power-delay product as the example cost function for optimization; we can adjust the power-delay by tuning the frequency of the clock domains of each SoC IP block. We show that the GCP optimization framework can accommodate other cost functions as well, while effectively directing SoC optimization efforts. Our case studies demonstrate that the GCP algorithm can converge quickly to solutions even in the very large (exponential) search spaces describing permissible SoC configurations, with no designer intervention (for instance, we find the solution of a 6-dimensional space with 19000 configurations in 11 steps). Even though our initial implementation relies on manual source code instrumentation, we only add 1\% extra lines of code to the design. This represents annotating less than 0.2\% of the ports of the overall Multi-processor SoC design.}, url = {https://research.microsoft.com/apps/pubs/?id=80367} } @InProceedings{cretu-wasl08, author = {Gabriela F. Cre{\c t}u-Cioc{\^ a}rlie and Mihai Budiu and Moises Goldszmidt}, title = {Hunting for problems with {Artemis}}, booktitle = {USENIX Workshop on the Analysis of System Logs (WASL)}, address = {San Diego, CA}, month = {December 7}, year = {2008}, abstract = {Artemis is a modular application designed for analyzing and troubleshooting the performance of large clusters running datacenter services. Artemis is composed of four modules: (1) distributed log collection and data extraction, (2) a database storing the extracted data, (3) an interactive visualization tool for exploring the data, and (4) a plug-in interface (and a set of sample plug-ins) allowing users to implement data analysis tools including (a) the extraction and construction of new features from the basic measurements collected, and (b) the implementation and invocation of statistical and machine learning algorithms and tools. In this paper we describe each of these components and then we illustrate the power of the plug-in architecture by presenting a case-study using Artemis to analyze a Dryad application running on a 240-machine cluster.}, url = {https://mihaibudiu.github.io/work/wasl08.pdf}, confweb = {https://www.usenix.org/events/wasl08/} } @InProceedings{yu-osdi08, author = {Yuan Yu and Michael Isard and Dennis Fetterly and Mihai Budiu and {\'U}lfar Erlingsson and Pradeep Kumar Gunda and Jon Currey}, title = {{DryadLINQ}: A System for General-Purpose Distributed Data-Parallel Computing Using a High-Level Language}, booktitle = {Symposium on Operating System Design and Implementation (OSDI)}, pages = {14}, address = {San Diego, CA}, month = {December 8-10}, year = {2008}, abstract = {DryadLINQ is a system and a set of language extensions that enable a new programming model for large scale distributed computing. It generalizes previous execution environments such as SQL, MapReduce, and Dryad in two ways: by adopting an expressive data model of strongly typed .NET objects; and by supporting general-purpose imperative and declarative operations on datasets within a traditional high-level programming language. A DryadLINQ program is a sequential program composed of LINQ expressions performing arbitrary side effect-free transformations on datasets, and can be written and debugged using standard .NET development tools. The DryadLINQ system automatically and transparently translates the data-parallel portions of the program into a distributed execution plan which is passed to the Dryad execution platform. Dryad, which has been in continuous operation for several years on production clusters made up of thousands of computers, ensures efficient, reliable execution of this plan. We describe the implementation of the DryadLINQ compiler and runtime. We evaluate DryadLINQ on a varied set of programs drawn from domains such as web-graph analysis, large-scale log mining, and machine learning. We show that excellent absolute performance can be attained --- a general-purpose sort of 10^{12} Bytes of data executes in 319 seconds on a 240-computer, 960- disk cluster --- as well as demonstrating near-linear scaling of execution time on representative applications as we vary the number of computers used for a job.}, note = {Best paper award. 2018 ACM SIGOPS Hall of Fame award. The video shows DryadLINQ used from Visual Studio.}, url = {https://mihaibudiu.github.io/work/DryadLINQ.pdf}, confweb = {https://www.usenix.org/event/osdi08/}, acceptancerate = {26/193=13\%}, video = {https://1drv.ms/i/s!AlywK8G1COQ_gZdCnMepIQuE7Qr13A} } @TechReport{yu-tr08, author = {Yuan Yu and Michael Isard and Dennis Fetterly and Mihai Budiu and Ulfar Erlingsson and Pradeep Kumar Gunda and Jon Currey and Frank McSherry and Kannan Achan}, title = {Some sample programs written in {DryadLINQ}}, institution = {Microsoft Research}, number = {MSR-TR-2008-74}, pages = {37}, month = may, year = {2008}, abstract = {DryadLINQ is a system and a set of language extensions that enable a new programming model for large scale distributed computing. This technical report contains annotated listings of several example programs written using DryadLINQ, illustrating typical usage.}, url = {https://research.microsoft.com/research/pubs/view.aspx?msr_tr_id=MSR-TR-2008-74} } @inproceedings{venkataramani-dac07, author = {Girish Venkataramani and Tiberiu Chelcea and Mihai Budiu and Seth C. Goldstein}, title = {Critical Path: A Tool for System-Level Timing Analysis}, booktitle = {Design Automation Conference (DAC)}, address = {San Diego, CA}, month = {June 4--8}, year = {2007}, abstract = {An effective method for focusing optimization effort on the most important parts of a design is to examine those elements on the critical path. Traditionally, the critical path is defined at the RTL level, as the longest path in the combinational logic between clocked registers. In this paper, we present a system-level timing analysis technique to define the concept of a Global Critical Path (GCP), for predicting system-level performance. We show how the GCP can be used as a theoretical and practical tool for understanding, summarizing and optimizing the behavior of highly concurrent self-timed circuits. We formally define the GCP and show how it can be constructed using a discrete event model and hardware profiling techniques. The GCP provides valuable insight into the control-path behavior of circuits and in finding system-level bottlenecks. We have incorporated the GCP construction and analysis framework into a high-level synthesis and simulation toolchain, thus enabling complete automation in modeling, analysis and optimization.}, note = {An expanded version is in the technical report CMU-CS-06-144}, url = {https://mihaibudiu.github.io/work/dac07.pdf}, confweb = {https://www.dac.com/About/Conference-Archive/44th-DAC-2007}, acceptancerate = {161/713 = 22\%} } @inproceedings{isard-eurosys07, author = {Michael Isard and Mihai Budiu and Yuan Yu and Andrew Birrell and Dennis Fetterly}, title = {{Dryad}: Distributed Data-Parallel Programs from Sequential Building Blocks}, booktitle = {European Conference on Computer Systems (EuroSys)}, pages = {59--72}, address = {Lisbon, Portugal}, month = {March 21-23}, year = {2007}, abstract = {Dryad is a general-purpose distributed execution engine for coarse-grain data-parallel applications. A Dryad application combines computational ``vertices'' with communication ``channels'' to form a dataflow graph. Dryad runs the application by executing the vertices of this graph on a set of available computers, communicating as appropriate through files, TCP pipes, and shared-memory FIFOs. The vertices provided by the application developer are quite simple and are usually written as sequential programs with no thread creation or locking. Concurrency arises from Dryad scheduling vertices to run simultaneously on multiple computers, or on multiple CPU cores within a computer. The application can discover the size and placement of data at run time, and modify the graph as the computation progresses to make efficient use of the available resources. Dryad is designed to scale from powerful multi-core single computers, through small clusters of computers, to data centers with thousands of computers. The Dryad execution engine handles all the difficult problems of creating a large distributed, concurrent application: scheduling the use of computers and their CPUs, recovering from communication or computer failures, and transporting data between vertices.}, note = {Also as technical report MSR-TR-2006-140. EuroSys 2017 Test of Time Award.}, url = {https://mihaibudiu.github.io/work/eurosys07.pdf} } @InProceedings{budiu-asid06, author = {Mihai Budiu and {\'U}lfar Erlingsson and Mart{\'\i}n Abadi}, title = {Architectural Support for Software-Based Protection}, booktitle = {Workshop on Architectural and System Support for Improving Software Dependability (ASID)}, pages = {42--51}, address = {San Jose, CA}, month = {October 21}, year = {2006}, abstract = {Control-Flow Integrity (CFI) is a property that guarantees program control flow cannot be subverted by a malicious adversary, even if the adversary has complete control of data memory. We have shown in prior work how CFI can be enforced by using inlined software guards that perform safety checks. The first part of this paper shows how modest Instruction Set Architecture (ISA) support can replace such guard code with single instructions. \par On the foundation of CFI we have implemented XFI: a protection system that offers fine-grained memory access control and fundamental integrity guarantees for critical system state. XFI can be seen as a flexible, generalized form of software-based fault isolation (SFI). In the second part of this paper we present ISA support for XFI, in the form of simple bounds-check instructions. \par CFI and XFI can significantly increase the security and integrity of software execution. Our results indicate that support for CFI and XFI is a straightforward, simple addition to hardware architectures. Compared to software guards, such hardware support increases the efficiency and simplicity of enforcement.}, note = {Also as technical report MSR-TR-2006-115}, url = {https://mihaibudiu.github.io/work/asid06.pdf} } @TechReport{venkataramani-tr06, author = {Girish Venkataramani and Tiberiu Chelcea and Mihai Budiu and Seth C. Goldstein}, title = {Modeling the Global Critical Path in Concurrent Systems}, institution = {Carnegie Mellon University, Computer Science Department}, number = {CMU-CS-06-144}, pages = {22}, month = {August}, year = {2006}, abstract = {We show how the global critical path can be used as a practical tool for understanding, optimizing and summarizing the behavior of highly concurrent self-timed circuits. Traditionally, critical path analysis has been applied to DAGs, and thus is constrained to combinatorial sub-circuits. We formally define the global critical path (GCP) and show how it can be constructed using only local information that is automatically derived directly from the circuit. We introduce a form of Production Rules, which can accurately determine the GCP for a given input vector even for modules which exhibit choice. \par The GCP provides valuable insight into the behavior of circuits and, more generally, into the control behavior of the application class. These insights help in formulating new optimizations and re-formulating existing ones to use the GCP knowledge. We have incorporated our framework into a high-level synthesis tool-chain, which automatically computes GCP, and utilizes the GCP for large scale circuits. We demonstrate the effectiveness of the GCP framework by re-formulating two traditional CAD optimizations to use the GCPyielding efficient algorithms which improve circuit power (by up to 9\%) and performance (by up to 60\%) in our experiments.}, url = {https://reports-archive.adm.cs.cmu.edu/anon/2006/abstracts/06-144.html} } @InProceedings{erlingsson-osdi06, author = {{\'{U}}lfar Erlingsson and Mart{\'\i}n Abadi and Michael Vrable and Mihai Budiu and George C. Necula}, title = {{XFI}: Software Guards for System Address Spaces}, booktitle = {Symposium on Operating System Design and Implementation (OSDI)}, pages = {75--88}, address = {Seattle, WA}, month = {November 6-8}, year = {2006}, abstract = {XFI is a comprehensive protection system that offers both flexible access control and fundamental integrity guarantees, at any privilege level and even for legacy code in commodity systems. For this purpose, XFI combines static analysis with inline software guards and a two-stack execution model. We have implemented XFI forWindows on the x86 architecture using binary rewriting and a simple, stand-alone verifier; the implementation�s correctness depends on the verifier, but not on the rewriter. We have applied XFI to software such as device drivers and multimedia codecs. The resulting modules function safely within both kernel and user-mode address spaces, with only modest enforcement overheads}, url = {https://mihaibudiu.github.io/work/osdi06.pdf}, acceptancerate = {27/150=18\%}, confweb = {https://www.usenix.org/events/osdi06} } @InProceedings{mishra-asplos06, author = {Mahim Mishra and Timothy J. Callahan and Tiberiu Chelcea and Girish Venkataramani and Mihai Budiu and Seth C. Goldstein}, title = {{Tartan}: Evaluating Spatial Computation For Whole Program Execution}, booktitle = {International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS)}, pages = {163--174}, address = {San Jose, CA}, month = {October 21-25}, year = {2006}, abstract = {Spatial Computing (SC) has been shown to be an energy-efficient model for implementing program kernels. In this paper we explore the feasibility of using SC for more than small kernels. To this end, we evaluate the performance and energy efficiency of entire applications on Tartan, a general-purpose architecture which integrates a reconfigurable fabric (RF) with a superscalar core. Our compiler automatically partitions and compiles an application into an instruction stream for the core and a configuration for the RF. We use a detailed simulator to capture both timing and energy numbers for all parts of the system. Our results indicate that a hierarchical RF architecture, designed around a scalable interconnect, is instrumental in harnessing the benefits of spatial computation. The interconnect uses static configuration and routing at the lower levels and a packet-switched, dynamically-routed network at the top level. Tartan is most energyefficient when almost all of the application is mapped to the RF, indicating the need for the RF to support most general-purpose programming constructs. Our initial investigation reveals that such a system can provide, on average, an order of magnitude improvement in energy-delay compared to an aggressive superscalar core on single-threaded workloads.}, url = {https://mihaibudiu.github.io/work/asplos06.pdf}, acceptancerate = {38/158=24\%} } @InProceedings{budiu-ispass05, author = {Mihai Budiu and Pedro V. Artigas and Seth Copen Goldstein}, title = {Dataflow: A Complement to Superscalar}, booktitle = {IEEE International Symposium on Performance Analysis of Systems and Software (ISPASS)}, pages = {177--186}, address = {Austin, TX}, month = {March 20-22}, year = {2005}, abstract = {There has been a resurgence of interest in dataflow architectures, because of their potential for exploiting parallelism with low overhead. In this paper we analyze the performance of a class of static dataflow machines on integer media and control-intensive programs and we explain why a dataflow machine, even with unlimited resources, does not always outperform a superscalar processor on general-purpose codes, under the assumption that both machines take the same time to execute basic operations. We compare a program-specific dataflow machine with unlimited parallelism to a superscalar processor running the same program. While the dataflow machines provide very good performance on most data-parallel programs, we show that the dataflow machine cannot always take advantage of the available parallelism. Using the dynamic critical path we investigate the mechanisms used by superscalar processors to provide a performance advantage and their impact on a dataflow model.}, url = {https://www.cs.cmu.edu/~mihaib/research/ispass05.pdf}, acceptancerate = {27/92 = 29\%}, confweb = {https://www.ispass.org/ispass2005} } @InProceedings{budiu-odes05, author = {Mihai Budiu and Seth Copen Goldstein}, title = {Inter-Iteration Scalar Replacement in the Presence of Conditional Control-Flow}, booktitle = {Workshop on Optimizations for {DSP} and Embedded Systems (ODES)}, pages = {20--29}, address = {San Jose, CA}, month = {March 20}, year = {2005}, abstract = {A large class of multimedia programs for embedded systems manipulate data represented as dense matrices. In this paper we revisit the classical optimization of scalar replacement of array elements and pointer accesses; this optimization allocates array elements to registers, reducing memory traffic. We generalize the state-of-the-art algorithm, by Carr and Kennedy~\cite{carr-spe94}, improving it to handle simultaneously both conditional control-flow and inter-iteration data reuse. Our algorithm operates within the same assumptions of the classical one (perfect dependence information), and has the same limitations (increased register pressure). It is, however, optimal in the sense that within each code region where scalar promotion is applied, given sufficient registers, each memory location is read/written at most once.}, url = {https://www.cs.cmu.edu/~mihaib/research/odes05.pdf}, confweb = {https://www.ece.vill.edu/~deepu/odes/odes-3_program.html} } @inproceedings{abadi-icfem05, author = {Mart{\'\i}n Abadi and Mihai Budiu and {\'U}lfar Erlingsson and Jay Ligatti}, title = {A Theory of Secure Control-Flow}, booktitle = {International Conference on Formal Engineering Methods (ICFEM)}, pages = {111-124}, address = {Manchester, UK}, month = {November 1-4}, year = {2005}, abstract = {Control-Flow Integrity (CFI) means that the execution of a program dynamically follows only certain paths, in accordance with a static policy. CFI can prevent attacks that, by exploiting buffer overflows and other vulnerabilities, attempt to control program behavior. This paper develops the basic theory that underlies two practical techniques for CFI enforcement, with precise formulations of hypotheses and guarantees.}, url = {https://mihaibudiu.github.io/work/icfem05.pdf} } @inproceedings{abadi-ccs05, author = {Mart{\'\i}n Abadi and Mihai Budiu and {\'U}lfar Erlingsson and Jay Ligatti}, title = {Control-Flow Integrity}, booktitle = {ACM Conference on Computer and Communication Security (CCS)}, pages = {340-353}, address = {Alexandria, VA}, month = {November 7-11}, year = {2005}, abstract = {Current software attacks often build on exploits that subvert machine-code execution. The enforcement of a basic safety property, Control-Flow Integrity (CFI), can prevent such attacks from arbitrarily controlling program behavior. CFI enforcement is simple, and its guarantees can be established formally, even with respect to powerful adversaries. Moreover, CFI enforcement is practical: it is compatible with existing software and can be efficiently implemented using software rewriting in commodity systems. Finally, CFI provides a useful foundation for enforcing further security policies, such as policies that constrain the use of data memory.}, note = {CCS Test of Time Award in 2015}, url = {https://mihaibudiu.github.io/work/ccs05.pdf}, acceptancerate = {38/250=25\%}, confweb = {https://www.acm.org/sigs/sigsac/ccs/CCS2005} } @InProceedings{budiu-asplos04, author = {Mihai Budiu and Girish Venkataramani and Tiberiu Chelcea and Seth Copen Goldstein}, title = {Spatial Computation}, booktitle = {International Conference on Architectural Support for Programming Languages and Operating Systems (ASPLOS)}, pages = {14--26}, address = {Boston, MA}, month = {October 9-13}, year = {2004}, abstract = {This paper describes a computer architecture that relies on the direct translation of high-level language programs into {\em Spatial Computation} (SC) hardware structures. SC program implementations are completely distributed, without any centralized control. SC circuits are optimized for {\em wires} at the expense of computation units. \par In this paper we investigate a particular implementation SC structures called ASH (Application-Specific Hardware). Under the assumption that computation is cheaper than communication, ASH replicates computation units to simplify interconnect, building a system which uses very simple, completely dedicated communication channels. As a consequence, communication on the datapath never requires arbitration; the only arbitration required is for accessing memory. ASH relies on very simple hardware primitives, using no associative structures, no multiported register files, no scheduling logic, no broadcast, and no clocks. As a consequence, ASH hardware is fast and extremely power efficient. \par In this work we demonstrate three features of ASH: (1) that such architectures can be built by automatic compilation of C programs, (2) that distributed computation is in some respects fundamentally different from monolithic superscalar processors and (3) that ASIC implementations of ASH use 3 orders of magnitude less energy compared to high-end superscalar processors, while being within a factor of two in performance.}, url = {https://www.cs.cmu.edu/~mihaib/research/asplos04.pdf}, acceptancerate = {24/169 = 14\%}, doi = {https://doi.acm.org/10.1145/1024393.1024396} } @InProceedings{koes-msp04, author = {David Koes and Mihai Budiu and Girish Venkataramani and Seth Copen Goldstein}, title = {Programmer Specified Pointer Independence}, booktitle = {Workshop on Memory System Performance (MSP)}, month = {June}, year = {2004}, abstract = {Good alias analysis is essential in order to achieve high performance on modern processors, yet interprocedural analysis does not scale well. We present a source code annotation, {\tt \#pragma independent}, which is a more flexible, intuitive and useful way for the programmer to provide pointer aliasing information than the current C99 {\tt restrict} keyword. We describe a tool which highlights the most important and most likely correct locations at which a programmer can insert the pragmas. We show that such annotations can be used effectively in compilers to achieve speedups of up to 1.2x.}, note = {Also as technical report CMU-CS-03-123}, url = {https://www.cs.cmu.edu/~mihaib/research/msp04.pdf}, confweb = {https://cs.anu.edu.au/~Steve.Blackburn/msp2004} } @InProceedings{venkataramani-iwls04, author = {Girish Venkataramani and Mihai Budiu and Seth Copen Goldstein}, title = {{C} to Asynchronous Dataflow Circuits: An End-to-End Toolflow}, booktitle = {International Workshop on Logic synthesis (IWLS)}, pages = {501--508}, address = {Temecula, CA}, month = {June}, year = {2004}, abstract = {We present a complete toolflow that translates ANSI-C programs into asynchronous circuits. The toolflow is built around a compiler that converts C into a functional dataflow intermediate representation, exposing instruction-level, pipeline and memory parallelism. The compiler performs optimizations and converts the intermediate representation into pipelined asynchronous circuits, with no centralized controllers. In the resulting circuits, control is distributed, communication is achieved through local wires, and arbitration for datapath resources is unnecessary. Circuits automatically synthesized from Mediabench kernels exhibit substantially better energy-delay than either single-issue processors or aggressive superscalar cores.}, note = {(full paper)}, url = {https://www.cs.cmu.edu/~mihaib/research/iwls04.pdf}, confweb = {https://www.iwls.org/} } @InProceedings{budiu-cgo03, author = {Mihai Budiu and Seth Copen Goldstein}, title = {Optimizing Memory Accesses For Spatial Computation}, booktitle = {International ACM/IEEE Symposium on Code Generation and Optimization (CGO)}, pages = {216-227}, address = {San Francisco, CA}, month = {March 23--26}, year = {2003}, abstract = {In this paper we present the internal representation and optimizations used by the CASH compiler for improving the memory parallelism of pointer-based programs. CASH uses an SSA-based representation for memory, which compactly summarizes both control-flow and dependence information. In CASH, memory optimization is a four-step process: (1) first an initial, relatively coarse, representation of memory dependences is built; (2) next, unnecessary memory dependences are removed using dependence tests; (3) third, redundant memory operations are removed (4) finally, parallelism is increased by pipelining memory accesses in loops. While the first three steps above are very general, the loop pipelining transformations are particularly applicable for spatial computation, which is the primary target of CASH. The redundant memory removal optimizations presented are: load/store hoisting (subsuming partial redundancy elimination and common-subexpression elimination), load-after-store removal, store-before-store removal (dead store removal) and loop-invariant load motion. One of our loop pipelining transformations is a new form of loop parallelization, called loop decoupling. This transformation separates independent memory accesses within a loop body into several independent loops, which are allowed dynamically to slip with respect to each other. A new computational primitive, a token generator is used to dynamically control the amount of slip, allowing maximum freedom, while guaranteeing that no memory dependences are violated.}, url = {https://www.cs.cmu.edu/~mihaib/research/cgo03.pdf}, confweb = {https://www.cgo.org/cgo2003}, url2 = {https://csdl.computer.org/comp/proceedings/cgo/2003/1913/00/19130216abs.htm} } @InProceedings{goldstein-asap03, author = {Seth Goldstein and Mihai Budiu and Mahim Mishra and Girish Venkataramani}, title = {Reconfigurable Computing and Electronic Nanotechnology}, booktitle = {IEEE International Conference on Application-specific Systems, Architectures and Processors}, pages = {132--143}, address = {Hague, the Netherlands}, month = {June 24-26}, year = {2003}, abstract = {In this paper we examine the opportunities brought about by recent progress in electronic nanotechnology and describe the methods needed to harness them for building a new computer architecture. In this process we decompose some traditional abstractions, such as the transistor, into fine-grain pieces, such as signal restoration and input-output isolation. We also show how we can forgo the extreme reliability of CMOS circuits for low-cost chemical self-assembly at the expense of large manufacturing defect densities. We discuss advanced testing methods which can be used to recover perfect functionality from unreliable parts. We proceed to show how the molecular switch, the regularity of the circuits created by self-assembly and the high defect densities logically require the use of reconfigurable hardware as a basic building block for hardware design. We then capitalize on the convergence of compilation and hardware synthesis (which takes place when programming reconfigurable hardware) to propose the complete elimination of the instruction-set architecture from the system architecture, and the synthesis of asynchronous dataflow machines directly from high-level programming languages, such as C. We discuss in some detail a scalable compilation system that perform this task.}, note = {Invited paper}, url = {https://www.cs.cmu.edu/~mihaib/research/asap03.pdf}, confweb = {https://www-ece.rice.edu/asap2003/program03.html}, url2 = {https://csdl.computer.org/comp/proceedings/asap/2003/1992/00/19920132abs.htm} } @PhdThesis{budiu-phd03, key = {PhD Thesis 03}, author = {Mihai Budiu}, title = {Spatial Computation}, school = {Carnegie Mellon University, Computer Science Department}, number = {CMU-CS-03-217}, pages = {225}, month = {December}, year = {2003}, abstract = {This thesis presents a compilation framework for translating ANSI C programs into hardware dataflow machines. The framework is embodied in the CASH compiler, a Compiler for Application-Specific Hardware. CASH generates asynchronous hardware circuits that directly implement the functionality of the source program, without using any interpretative structures. This style of computation is dubbed ``Spatial Computation''. CASH relies extensively on predication and speculation for building efficient hardware circuits. \par The first part of this document describes Pegasus, the internal representation of CASH, and a series of novel program transformations performed by CASH, the most notable of which are a new optimal register-promotion algorithm and partial redundancy elimination for memory accesses based on predicate manipulation. \par The second part of this document evaluates the performance of the generated circuits using simulation. Using media processing benchmarks, we show that for the domain of embedded computation, the circuits generated by CASH can sustain high levels of instruction level parallelism, due to the effective use of dataflow software pipelining. A comparison of Spatial Computation and superscalar processors highlights some of the weaknesses of our model of computation, such as the lack of branch prediction and register renaming. \par The results presented in this document can be applied in several domains: (1) most of the compiler optimizations are applicable to traditional compilers for high-level languages (2) CASH itself can be used as a hardware synthesis tool for very fast system-on-a-chip prototyping directly from C sources, (3) the compilation framework we describe can be applied to the translation of imperative languages to dataflow machines, (4) we have extended the dataflow machine model to encompass predication, data-speculation and control-speculation, and (5) the tool-chain described and some specific optimizations, such as lenient execution, and pipeline balancing, can be used for synthesis and optimization of asynchronous hardware.}, note = {Technical report CMU-CS-03-217}, url = {https://www.cs.cmu.edu/~mihaib/research/thesis.pdf}, url2 = {https://reports-archive.adm.cs.cmu.edu/anon/2003/abstracts/03-217.html} } @InBook{goldstein-chapter03, key = {Chapter 03}, author = {Seth Copen Goldstein and Mihai Budiu}, editor = {Mark A. Reed and Takhee Lee}, title = {Molecules, Gates, Circuits, Computers}, chapter = {in Molecular Nanoelectronics}, pages = {327--388}, publisher = {American Scientific Publishers}, month = {January}, year = {2003}, url = {https://aspbs.com/molecularnano.html} } @InProceedings{budiu-fpl02, author = {Mihai Budiu and Seth Copen Goldstein}, title = {Compiling Application-Specific Hardware}, booktitle = {International Conference on Field Programmable Logic and Applications (FPL)}, pages = {853--863}, address = {Montpellier (La Grande-Motte), France}, month = {September 2--4}, year = {2002}, abstract = {In this paper we describe ASH, an architectural framework for implementing Application-Specific Hardware. ASH is based on automatic hardware synthesis from high-level languages. The generated circuits use only localized computation structures; in consequence, we expect these circuits to be fast, to use little power and to scale well with program complexity. \par We present in detail CASH, a scalable compiler framework for ASH, which generates hardware from programs written in C. Our compiler exploits instruction level parallelism by using aggressive speculation and dynamic scheduling. Based on this compilation scheme, we evaluate the computational resources necessary for implementing complex integer-based programs, and we suggest architectural features that would support the ASH framework.}, url = {https://www.cs.cmu.edu/~mihaib/research/fpl02-budiu.pdf}, confweb = {https://www.lirmm.fr/fpl2002/home.html}, url2 = {https://springerlink.metapress.com/openurl.asp?genre=article&issn=0302-9743&volume=2438&spage=853} } @InProceedings{venkataramani-fpl02, key = {FPL 02a}, author = {Girish Venkataramani and Suraj Sudhir and Mihai Budiu and Seth Copen Goldstein}, title = {Factors Influencing the Performance of a CPU-RFU Hybrid Architecture}, booktitle = {International Conference on Field Programmable Logic and Applications (FPL)}, pages = {955--965}, address = {Montpellier (La Grande-Motte), France}, month = {September}, year = {2002}, abstract = {Closely coupling a reconfigurable fabric with a conventional processor has been shown to successfully improve the system performance. However, today s superscalar pro-cessors are both complex and adept at extracting Instruction Level Parallelism (ILP), which introduces many complex issues to the design of a hybrid CPU-RFU system. This paper examines the design of a superscalar processor augmented with a closely-coupled recon-figurable fabric. It identifies architectural and compiler issues that affect the performance of the overall system. Previous efforts at combining a processor core with a reconfigurable fabric are examined in the light of these issues. We also present simulation results that emphasize the impact of these factors.}, url = {https://www.cs.cmu.edu/~mihaib/research/fpl02-girish.pdf}, url2 = {https://springerlink.metapress.com/openurl.asp?genre=article&issn=0302-9743&volume=2438&spage=955}, confweb = {https://www.lirmm.fr/fpl2002/home.html} } @InProceedings{budiu-fccm02, author = {Mihai Budiu and Mahim Mishra and Ashwin Bharambe and Seth Copen Goldstein}, title = {Peer-to-peer Hardware-Software Interfaces for Reconfigurable Fabrics}, booktitle = {IEEE Symposium on Field-Programmable Custom Computing Machines (FCCM)}, pages = {57--66}, address = {Napa Valley, CA}, month = {April}, year = {2002}, abstract = {In this paper we describe a peer-to-peer interface between processor cores and reconfigurable fabrics. The main advantage of the peer-to-peer model is that it greatly expands the scope of application for reconfigurable computing and hence its potential benefits. The primary extension in our model is that ``code'' on the reconfigurable hardware unit is allowed to invoke routines both on the reconfigurable unit itself and on the fixed logic processor. We describe the software constructs and compilation mechanisms needed for such an architecture, including a detailed description of the interface between the two parts of the application.}, url = {https://www.cs.cmu.edu/~mihaib/research/fccm02-peer.pdf}, url2 = {https://csdl.computer.org/comp/proceedings/fccm/2002/1801/00/18010057abs.htm}, confweb = {https://www.fccm.org/past/2017/pastFCCMs/preprog2002.html} } @InProceedings{goldstein-isca01, author = {Seth Copen Goldstein and Mihai Budiu}, title = {{NanoFabrics}: Spatial Computing Using Molecular Electronics}, booktitle = {International Symposium on Computer Architecture (ISCA)}, pages = {178--189}, address = {G\"{o}teborg, Sweden}, year = {2001}, abstract = {The continuation of the remarkable exponential increases in processing power over the recent past faces imminent challenges due in part to the physics of deep-submicron CMOS devices and the costs of both chip masks and future fabrication plants. A promising solution to these problems is offered by an alternative to CMOS-based computing, chemically assembled electronic nanotechnology (CAEN). In this paper we outline how CAEN based computing can become a reality. We briefly describe recent work in CAEN and how CAEN will affect computer architecture. We show how the inherently reconfigurable natures of CAEN devices can be exploited to provide high-density chips with defect tolerance which will significantly reduce the cost of manufacturing. After developing the basic building blocks of a CAEN based computing devices we present some preliminary results which indicate that CAEN based computing devices can meet or exceed the performance of CMOS based devices.}, note = {Selected for inclusion in ISCA@50 25-year Retrospective 1996-2020.}, url = {https://www.cs.cmu.edu/~seth/papers/goldstein-isca01.pdf}, confweb = {https://www.ce.chalmers.se/conf2001}, doi = {https://doi.acm.org/10.1145/379240.379262} } @InProceedings{budiu-europar00, key = {EuroPar 00}, author = {Mihai Budiu and Majd Sakr and Kip Walker and Seth Copen Goldstein}, title = {{BitValue} Inference: Detecting and Exploiting Narrow Bitwidth Computations}, booktitle = {European Conference on Parallel Processing (EUROPAR)}, series = {Lecture Notes in Computer Science}, volume = {1900}, pages = {969--979}, publisher = {Springer Verlag}, address = {M\"{u}nich, Germany}, year = {2000}, abstract = {We present a compiler algorithm called BitValue, which can discover both unused and constant bits in dusty-deck C programs. BitValue uses forward and backward dataflow analyses, generalizing constant-folding and dead-code detection at the bit-level. This algorithm enables compiler optimizations which target special processor architectures for computing on non-standard bitwidths. Using this algorithm we show that up to 31\% of the computed bytes are thrown away (for programs from SpecINT95 and Mediabench). A compiler for reconfigurable hardware uses this algorithm to achieve substantial reductions (up to 20-fold) in the size of the synthesized circuits.}, note = {An expanded version is in technical report CMU-CS-00-141}, url = {https://www.cs.cmu.edu/~mihaib/research/europar00.pdf}, url2 = {https://link.springer.de/link/service/series/0558/papers/1900/19000969.pdf} } @Article{goldstein-ieee00, key = {Computer 00}, author = {Seth Copen Goldstein and Herman Schmit and Mihai Budiu and Srihari Cadambi and Matt Moe and Reed Taylor}, title = {{PipeRench}: A Reconfigurable Architecture and Compiler}, journal = {IEEE Computer}, volume = {33}, number = {4}, pages = {70--77}, month = {April}, year = {2000}, abstract = {With the proliferation of highly specialized embedded computer systems has come a diversification of workloads for computing devices. General-purpose processors are struggling to efficiently meet these applications' disparate needs, and custom hardware is rarely feasible. According to the authors, reconfigurable computing, which combines the flexibility of general-purpose processors with the efficiency of custom hardware, can provide the alternative. PipeRench and its associated compiler comprise the authors' new architecture for reconfigurable computing. Combined with a traditional digital signal processor, microcontroller or general-purpose processor, PipeRench can support a system's various computing needs without requiring custom hardware. The authors describe the PipeRench architecture and how it solves some of the pre-existing problems with FPGA architectures, such as logic granularity, configuration time, forward compatibility, hard constraints and compilation time.}, note = {2014 ACM SIGARCH/IEEE-CS TCCA Influential ISCA Paper Award in 2014}, url = {https://www.cs.cmu.edu/~mihaib/research/computer.pdf}, url2 = {https://ieeexplore.ieee.org/iel5/2/18132/00839324.pdf} } @Article{birman-tocs99, author = {Kenneth P. Birman and Mark Hayden and Oznur Oskasap and Zhen Xiao and Mihai Budiu and Yaron Minsky}, title = {Bimodal Multicast}, journal = {Transactions on Computer Systems (TOCS)}, volume = {17}, number = {2}, pages = {41--88}, month = may, year = {1999}, abstract = {There are many methods for making a multicast protocol ``reliable''. At one end of thespectrum, a reliable multicast protocol might offer atomicity guarantees, such as all-ornothing delivery, delivery ordering, and perhaps additional properties such as virtuallysynchronous addressing. At the other are protocols that use local repair to overcome transient packet loss in the network, offering ``best effort'' reliability. Yet none of this priorwork has treated stability of multicast delivery as a basic reliability property, such as might be needed in an internet radio, TV, or conferencing application. This paper looks atreliability with a new goal: development of a multicast protocol which is reliable in a sense that can be rigorously quantified and includes throughput stability guarantees. We characterize this new protocol as a ``bimodal multicast'' in reference to its reliability model, which corresponds to a family of bimodal probability distributions. Here, we introduce theprotocol, provide a theoretical analysis of its behavior, review experimental results, and discuss some candidate applications. These confirm that bimodal multicast is reliable,scalable, and that the protocol provides remarkably stable delivery throughput.}, url = {https://www.cs.cmu.edu/~mihaib/research/pbcast.pdf}, doi = {https://doi.acm.org/10.1145/312203.312207}, url2 = {https://www.acm.org/pubs/citations/journals/tocs/1999-17-2/p41-birman} } @InProceedings{goldstein-isca99, author = {Seth Copen Goldstein and Herman Schmit and Matthew Moe and Mihai Budiu and Srihari Cadambi and R. Reed Taylor and Ronald Laufer}, title = {{PipeRench}: a Coprocessor for Streaming Multimedia Acceleration}, booktitle = {International Symposium on Computer Architecture (ISCA)}, pages = {28--39}, address = {Atlanta, GA}, year = {1999}, abstract = {Future computing workloads will emphasize an architecture's ability to perform relatively simple calculations on massive quantities of mixed-width data. This paper describes a novel reconfigurable fabric architecture, PipeRench, optimized to accelerate these types of computations. PipeRench enables fast, robust compilers, supports forward compatibility, and virtualizes configurations, thus removing the fixed size constraint present in other fabrics. For the first time we explore how the bit-width of processing elements affects performance and show how the PipeRench architecture has been optimized to balance the needs of the compiler against the realities of silicon. Finally, we demonstrate extreme performance speedup on certain computing kernels (up to 190x versus a modern RISC processor), and analyze how this acceleration translates to application speedup.}, note = {2014 ACM SIGARCH/IEEE-CS TCCA Influential ISCA Paper Award. Also Selected for inclusion in ISCA@50 25-year Retrospective 1996-2020.}, url = {https://www.cs.cmu.edu/~mihaib/research/isca99.pdf}, doi = {https://doi.acm.org/10.1145/300979.300982}, confweb = {https://www.informatik.uni-trier.de/~ley/db/conf/isca/isca99.html} } @InProceedings{budiu-fpga99, author = {Mihai Budiu and Seth Copen Goldstein}, title = {Fast Compilation for Pipelined Reconfigurable Fabrics}, booktitle = {ACM/SIGDA International Symposium on Field Programmable Gate Arrays (FPGA)}, pages = {195--205}, address = {Monterey, CA}, year = {1999}, abstract = {In this paper we describe a compiler which quickly synthesizes high quality pipelined datapaths for pipelined reconfigurable devices. The compiler uses the same internal representation to perform synthesis, module generation, optimization, and place and route. The core of the compiler is a linear time place and route algorithm more than two orders of magnitude faster than traditional CAD tools. The key behind our approach is that we never backtrack, rip-up, or re-route. Instead, the graph representing the computation is preprocessed to guarantee routability by inserting lazy noops. The preprocessing steps provides enough information to make a greedy strategy feasible. The compilation speed is approximately 3000 bit-operations/second (on a PII/400Mhz) for a wide range of applications. The hardware utilization averages 60\% on the target device, PipeRench.}, url = {https://www.cs.cmu.edu/~mihaib/research/fpga99.pdf}, doi = {https://doi.acm.org/10.1145/296399.296459}, confweb = {https://portal.acm.org/toc.cfm?id=296399&dl=portal&dl=ACM&type=proceeding&idx=SERIES100&part=Proceedings&WantType=Proceedings&title=International%20Symposium%20on%20Field%20Programmable%20Gate%20Arrays} }