publications.bib

%%%%%%%%%% Under review %%%%%%%%%%
% arXiv preprint 1912.08792 (cs.LG), tagged with the whitepaper keyword.
@misc{khoram2019toco,
  author        = {Soroosh Khoram and Jing Li},
  title         = {{TOCO: A Framework for Compressing Neural Network Models Based on Tolerance Analysis}},
  year          = {2019},
  date          = {2019-12-18},
  eprint        = {1912.08792},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/1912.08792},
  pubstate      = {preprint},
  keywords      = {whitepaper},
  abstract      = {Neural network compression methods have enabled deploying large models on emerging edge devices with little cost, by adapting already-trained models to the constraints of these devices. The rapid development of AI-capable edge devices with limited computation and storage requires streamlined methodologies that can efficiently satisfy the constraints of different devices. In contrast, existing methods often rely on heuristic and manual adjustments to maintain accuracy, support only coarse compression policies, or target specific device constraints that limit their applicability. We address these limitations by proposing the TOlerance-based COmpression (TOCO) framework. TOCO uses an in-depth analysis of the model, to maintain the accuracy, in an active learning system. The results of the analysis are tolerances that can be used to perform compression in a fine-grained manner. Finally, by decoupling compression from the tolerance analysis, TOCO allows flexibility to changes in the hardware.}
}


% arXiv preprint 1912.08756 (cs.LG), tagged with the whitepaper keyword.
@misc{khoram2019interleaved,
  author        = {Soroosh Khoram and Stephen J Wright and Jing Li},
  title         = {{Interleaved Composite Quantization for High-Dimensional Similarity Search}},
  year          = {2019},
  date          = {2019-12-18},
  eprint        = {1912.08756},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/1912.08756},
  pubstate      = {preprint},
  keywords      = {whitepaper},
  abstract      = {Similarity search retrieves the nearest neighbors of a query vector from a dataset of high-dimensional vectors. As the size of the dataset grows, the cost of performing the distance computations needed to implement a query can become prohibitive. A method often used to reduce this computational cost is quantization of the vector space and location-based encoding of the dataset vectors. These encodings can be used during query processing to find approximate nearest neighbors of the query point quickly. Search speed can be improved by using shorter codes, but shorter codes have higher quantization error, leading to degraded precision. In this work, we propose the Interleaved Composite Quantization (ICQ) which achieves fast similarity search without using shorter codes. In ICQ, a small subset of the code is used to approximate the distances, with complete codes being used only when necessary. Our method effectively reduces both code length and quantization error. Furthermore, ICQ is compatible with several recently proposed techniques for reducing quantization error and can be used in conjunction with these other techniques to improve results. We confirm these claims and show strong empirical performance of ICQ using several synthetic and real-world datasets.}
}


% arXiv preprint 1904.03257 (cs.LG), tagged with the whitepaper keyword.
% The same eprint is also recorded as ratner2019sysml with a different keyword.
% The compound surname De Sa is braced so that it is not parsed as Last=Sa.
@misc{alex2019mlsys,
  author        = {Alexander Ratner and Dan Alistarh and Gustavo Alonso and David G. Andersen and Peter Bailis and Sarah Bird and Nicholas Carlini and Bryan Catanzaro and Jennifer Chayes and Eric Chung and Bill Dally and Jeff Dean and Inderjit S. Dhillon and Alexandros Dimakis and Pradeep Dubey and Charles Elkan and Grigori Fursin and Gregory R. Ganger and Lise Getoor and Phillip B. Gibbons and Garth A. Gibson and Joseph E. Gonzalez and Justin Gottschlich and Song Han and Kim Hazelwood and Furong Huang and Martin Jaggi and Kevin Jamieson and Michael I. Jordan and Gauri Joshi and Rania Khalaf and Jason Knight and Jakub Konečný and Tim Kraska and Arun Kumar and Anastasios Kyrillidis and Aparna Lakshmiratan and Jing Li and Samuel Madden and H. Brendan McMahan and Erik Meijer and Ioannis Mitliagkas and Rajat Monga and Derek Murray and Kunle Olukotun and Dimitris Papailiopoulos and Gennady Pekhimenko and Theodoros Rekatsinas and Afshin Rostamizadeh and Christopher Ré and Christopher {De Sa} and Hanie Sedghi and Siddhartha Sen and Virginia Smith and Alex Smola and Dawn Song and Evan Sparks and Ion Stoica and Vivienne Sze and Madeleine Udell and Joaquin Vanschoren and Shivaram Venkataraman and Rashmi Vinayak and Markus Weimer and Andrew Gordon Wilson and Eric Xing and Matei Zaharia and Ce Zhang and Ameet Talwalkar},
  title         = {{MLSys: The New Frontier of Machine Learning Systems}},
  year          = {2019},
  eprint        = {1904.03257},
  archivePrefix = {arXiv},
  primaryClass  = {cs.LG},
  url           = {https://arxiv.org/abs/1904.03257},
  pubstate      = {preprint},
  keywords      = {whitepaper}
}

% arXiv preprint 1904.03257, tagged with the subpaper keyword.
% The same eprint is also recorded as alex2019mlsys with a different keyword.
@article{ratner2019sysml,
  author        = {Ratner, Alexander and Alistarh, Dan and Alonso, Gustavo and Andersen, David G. and Bailis, Peter and Bird, Sarah and Carlini, Nicholas and Catanzaro, Bryan and Chayes, Jennifer and Chung, Eric and Dally, Bill and Dean, Jeff and Dhillon, Inderjit S. and Dimakis, Alexandros and Dubey, Pradeep and Elkan, Charles and Fursin, Grigori and Ganger, Gregory R. and Getoor, Lise and Gibbons, Phillip B. and Gibson, Garth A. and Gonzalez, Joseph E. and Gottschlich, Justin and Han, Song and Hazelwood, Kim and Huang, Furong and Jaggi, Martin and Jamieson, Kevin and Jordan, Michael I. and Joshi, Gauri and Khalaf, Rania and Knight, Jason and Kone{\v{c}}n{\'{y}}, Jakub and Kraska, Tim and Kumar, Arun and Kyrillidis, Anastasios and Lakshmiratan, Aparna and Li, Jing and Madden, Samuel and McMahan, H. Brendan and Meijer, Erik and Mitliagkas, Ioannis and Monga, Rajat and Murray, Derek and Olukotun, Kunle and Papailiopoulos, Dimitris and Pekhimenko, Gennady and Rekatsinas, Theodoros and Rostamizadeh, Afshin and R{\'{e}}, Christopher and {De Sa}, Christopher and Sedghi, Hanie and Sen, Siddhartha and Smith, Virginia and Smola, Alex and Song, Dawn and Sparks, Evan and Stoica, Ion and Sze, Vivienne and Udell, Madeleine and Vanschoren, Joaquin and Venkataraman, Shivaram and Vinayak, Rashmi and Weimer, Markus and Wilson, Andrew Gordon and Xing, Eric and Zaharia, Matei and Zhang, Ce and Talwalkar, Ameet},
  title         = {SysML: The New Frontier of Machine Learning Systems},
  journal       = {arXiv preprint arXiv:1904.03257},
  year          = {2019},
  month         = mar,
  date          = {2019-03-29},
  url           = {http://arxiv.org/abs/1904.03257},
  archivePrefix = {arXiv},
  eprint        = {1904.03257},
  primaryClass  = {cs.LG},
  abstract      = {Machine learning (ML) techniques are enjoying rapidly increasing adoption. However, designing and implementing the systems that support ML models in real-world deployments remains a significant obstacle, in large part due to the radically different development and deployment profile of modern ML methods, and the range of practical concerns that come with broader adoption. We propose to foster a new systems machine learning research community at the intersection of the traditional systems and ML communities, focused on topics such as hardware systems for ML, software systems for ML, and ML optimized for metrics beyond predictive accuracy. To do this, we describe a new conference, SysML, that explicitly targets research at the intersection of systems and machine learning with a program committee split evenly between experts in systems and ML, and an explicit focus on topics at the intersection of the two.},
  keywords      = {subpaper},
  note          = {preprint},
}

%%%%%%%%%% Refereed Journal %%%%%%%%%%
% Journal article (JSSC, 2020); volume and pages are not recorded in this entry.
@article{zha2020jssc,
  author   = {Zha, Yue and Nowak, Etienne and Li, Jing},
  title    = {{Liquid Silicon}: A Nonvolatile Fully Programmable Processing-In-Memory Processor with Monolithically Integrated {ReRAM} for {Big Data/Machine Learning} Applications (**invited**)},
  journal  = {IEEE Journal of Solid-State Circuits (**JSSC**)},
  year     = {2020},
  pubstate = {in print},
  keywords = {journal}
}


% Journal article, volume 12 (2018). Each field appears exactly once.
@article{shukla2018frontiers,
  author   = {Rohit Shukla and Soroosh Khoram and Erik Jorgensen and Jing Li and Mikko Lipasti and Stephen Wright},
  title    = {Computing Generalized Matrix Inverse on Spiking Neural Substrate},
  journal  = {Frontiers in neuroscience: Neuromorphic engineering},
  volume   = {12},
  pages    = {115},
  year     = {2018},
  month    = feb,
  date     = {2018-02-14},
  doi      = {10.3389/fnins.2018.00115},
  issn     = {1662-453X},
  keywords = {journal},
  abstract = {Emerging neural hardware substrates, such as IBM's TrueNorth Neurosynaptic System, can provide an appealing platform for deploying numerical algorithms. For example, a recurrent Hopfield neural network can be used to find the Moore-Penrose generalized inverse of a matrix, thus enabling a broad class of linear optimizations to be solved efficiently, at low energy cost. However, deploying numerical algorithms on hardware platforms that severely limit the range and precision of representation for numeric quantities can be quite challenging. This paper discusses these challenges and proposes a rigorous mathematical framework for reasoning about range and precision on such substrates. The paper derives techniques for normalizing inputs and properly quantizing synaptic weights originating from arbitrary systems of linear equations, so that solvers for those systems can be implemented in a provably correct manner on hardware-constrained neural substrates. The analytical model is empirically validated on the IBM TrueNorth platform, and results show that the guarantees provided by the framework for range and precision hold under experimental conditions. Experiments with optical flow demonstrate the energy benefits of deploying a reduced-precision and energy-efficient generalized matrix inverse engine on the IBM TrueNorth platform, reflecting 10× to 100× improvement over FPGA and ARM core baselines.}
}

% Invited journal article, volume 14, number 2 (2018).
% The year lives only in year/date, not inside the journal name.
@article{zha2018JOLPE,
  author    = {Yue Zha and Jing Li},
  title     = {Specialization: A New Path towards Low Power (INVITED)},
  journal   = {ASP Journal of Low Power Electronics},
  volume    = {14},
  number    = {2},
  year      = {2018},
  date      = {2018-02-15},
  doi       = {10.1166/jolpe.2018.1559},
  tppubtype = {article},
  keywords  = {journal}
}

% Journal letter, IEEE CAL volume 17, number 2.
% NOTE(review): month (July) and date (2018-01-03) disagree; presumably issue month vs. online date. Confirm which is intended.
@article{khoram2018CAL,
  author   = {Khoram, Soroosh and Zha, Yue and Li, Jing},
  title    = {An Alternative Analytical Approach to Associative Processing (Best of CAL)},
  journal  = {IEEE Computer Architecture Letters},
  year     = {2018},
  month    = jul,
  date     = {2018-01-03},
  volume   = {17},
  number   = {2},
  pages    = {113--116},
  doi      = {10.1109/LCA.2018.2789424},
  issn     = {1556-6056},
  keywords = {journal, Analytical models,Complexity theory,Computational modeling,Computer architecture,Parallel processing,Runtime,Two dimensional displays,Analysis of Algorithms and Problem Complexity,Associative Processors,Modeling techniques,Models of Computation},
  abstract = {Associative Processing (AP) is a promising alternative to the Von Neumann model as it addresses the memory wall problem through its inherent in-memory computations. However, because of the countless design parameter choices, comparisons between implementations of two so radically different models are challenging for simulation-based methods. To tackle these challenges, we develop an alternative analytical approach based on a new concept called architecturally-determined complexity. Using this method, we asymptotically evaluate the runtime/storage/energy bounds of the two models, i.e., AP and Von Neumann. We further apply the method to gain more insights into the performance bottlenecks of traditional AP and develop a new machine model named Two Dimensional AP to address these limitations. Finally, we experimentally validate our analytical method and confirm that the simulation results match our theoretical projections.},
}

% Journal letter, IEEE CAL volume 17, number 1.
% NOTE(review): year is 2017 with volume 17, while zha2017CALIMEC pairs 2017 with volume 16. Confirm the year/volume pairing.
@article{zha2017CALCMA,
  author   = {Yue Zha and Jing Li},
  title    = {{CMA}: A Reconfigurable Complex Matching Accelerator for Wire-speed Network Intrusion Detection},
  journal  = {IEEE Computer Architecture Letters},
  year     = {2017},
  date     = {2017-07-03},
  volume   = {17},
  number   = {1},
  pages    = {33--36},
  doi      = {10.1109/LCA.2017.2719023},
  issn     = {1556-6056},
  keywords = {journal, Computer architecture,Coprocessors,Encoding,IP networks,Intrusion detection,Ports (Computers),Accelerator,Intrusion Detection,Network Security,ReRAM,TCAM},
}

% Journal letter, IEEE CAL volume 16, number 2 (2017).
@article{zha2017CALIMEC,
  author   = {Yue Zha and Jing Li},
  title    = {{IMEC}: A Fully Morphable In-Memory Computing Fabric Enabled by Resistive Crossbar},
  journal  = {IEEE Computer Architecture Letters},
  year     = {2017},
  month    = feb,
  date     = {2017-02-22},
  volume   = {16},
  number   = {2},
  pages    = {123--126},
  doi      = {10.1109/LCA.2017.2672558},
  issn     = {1556-6056},
  keywords = {journal, Decoding,Energy efficiency,Field programmable gate arrays,Nonvolatile memory,Program processors,Non-volatile memory,TCAM,energy-efficiency computing,processing-in-memory},
}

% Invited journal article, JSSC volume 49, number 4 (2014).
% The cell-area unit in the title is typeset with math/superscript markup, because a bare caret is invalid outside math mode.
@article{li2014jssc,
  author   = {Jing Li and Robert Montoye and Masatoshi Ishii and Leland Chang},
  title    = {1 {Mb} 0.41 {$\mu$m\textsuperscript{2}} {2T-2R} cell nonvolatile {TCAM} with two-bit encoding and clocked self-referenced sensing (INVITED)},
  journal  = {IEEE Journal of Solid-State Circuits},
  year     = {2014},
  month    = apr,
  volume   = {49},
  number   = {4},
  pages    = {896--907},
  doi      = {10.1109/JSSC.2013.2292055},
  issn     = {0018-9200},
  keywords = {journal, content-addressable storage,encoding,phase change memories,2T 2R cell nonvolatile TCAM,CMOS technology,algorithmic mapping,clocked self referenced sensing,phase change memory technology,resistive memories,size 90 nm,time 1.9 ns,two bit encoding,Arrays,Encoding,Microprocessors,Phase change materials,Random access memory,Sensors,Associative computing,encoding,hardware accelerator,intrusion detection,matchline compensation,nonvolatile,packet classification,phase change memory (PCM),search engine,self-referenced sensing,ternary content addressable memory (TCAM)},
  abstract = {This work demonstrates the first fabricated 1 Mb nonvolatile TCAM using 2-transistor/2-resistive-storage (2T-2R) cells to achieve >10× smaller cell size than SRAM-based TCAMs at the same technology node. The test chip was designed and fabricated in IBM 90 nm CMOS technology and mushroom phase-change memory (PCM) technology. The primary challenge for enabling reliable array operation with such aggressive cell is presented, namely, severely degraded sensing margin due to significantly lower ON/OFF ratio of resistive memories (~10^2 for PCM) than that of traditional MOSFETs (>10^5 ). To address this challenge, two enabling techniques were developed and implemented in hardware: 1) two-bit encoding and 2) a clocked self-referenced sensing scheme (CSRSS). In addition, the two-bit encoding can also improve algorithmic mapping by effectively compressing TCAM entries. The 1 Mb chip demonstrates reliable low voltage search operation (VDDmin ~750 mV) and a match delay of 1.9 ns under nominal operating conditions.},
}

% Journal article, Thin Solid Films volume 536 (2013).
% A single keywords field carries both the journal category tag and the subject keywords.
@article{cil2013thinfilm,
  author    = {Cil, K. and Zhu, Y. and Li, Jing and Lam, C. H. and Silva, H.},
  title     = {Assisted cubic to hexagonal phase transition in {GeSbTe} thin films on silicon nitride},
  journal   = {Thin Solid Films},
  volume    = {536},
  pages     = {216--219},
  year      = {2013},
  publisher = {Elsevier},
  issn      = {0040-6090},
  doi       = {10.1016/j.tsf.2013.03.087},
  url       = {http://www.sciencedirect.com/science/article/pii/S0040609013005476},
  keywords  = {journal, Phase change memory, Germanium–antimony–tellurium, Phase transition temperature, Face-centered cubic, Hexagonal close-packed, Substrate dependence, Silicon nitride, Silicon dioxide}
}

% Journal article, IEEE TED volume 59, number 11 (2012).
@article{zhang2012TED,
  author   = {Xiao Zhang and Jerome Mitard and Lars-Ake Ragnarsson and Tomas Hoffmann and Michael Deal and Melody E. Grubbs and Jing Li and Blanka Magyari-Kope and Bruce M. Clemens and Yoshio Nishi},
  title    = {Theory and Experiments of the Impact of Work-Function Variability on Threshold Voltage Variability in {MOS} Devices},
  journal  = {IEEE Transactions on Electron Devices},
  year     = {2012},
  month    = nov,
  volume   = {59},
  number   = {11},
  pages    = {3124--3126},
  doi      = {10.1109/TED.2012.2212021},
  issn     = {0018-9383},
  keywords = {journal, MOSFET,failure analysis,probability,random-access storage,semiconductor device models,semiconductor device reliability,MOS devices,MOSFET,WFV,grain orientation,polycrystalline metal gate,random dopant fluctuation,size 22 nm,static RAM failure probability,threshold voltage variability,work-function variability,Integrated circuit modeling,Logic gates,Random access memory,Resource description framework,Semiconductor device modeling,MOSFETS,Metal gate,variability,work function (WF)},
}

% Journal article, Nanotechnology volume 23, number 22 (2012); pages holds the article number.
@article{cywar2012nano,
  title    = {The impact of heater-recess and load matching in phase change memory mushroom cells},
  author   = {Adam Cywar and Jing Li and Chung Lam and Helena Silva},
  journal  = {Nanotechnology},
  year     = {2012},
  date     = {2012-05-10},
  volume   = {23},
  number   = {22},
  pages    = {225201},
  doi      = {10.1088/0957-4484/23/22/225201},
  url      = {http://stacks.iop.org/0957-4484/23/i=22/a=225201},
  keywords = {journal},
}

% Invited journal article, Science China Information Sciences volume 54, number 5 (2011).
@article{li2011sciencechina,
  author   = {Li, Jing and Lam, Chung},
  title    = {Phase change memory (INVITED)},
  journal  = {Science China Information Sciences},
  year     = {2011},
  month    = may,
  day      = {01},
  volume   = {54},
  number   = {5},
  pages    = {1061--1072},
  issn     = {1869-1919},
  doi      = {10.1007/s11432-011-4223-x},
  url      = {https://doi.org/10.1007/s11432-011-4223-x},
  keywords = {journal},
  abstract = {Phase change memory (PCM) is a non-volatile solid-state memory technology based on the large resistivity contrast between the amorphous and crystalline states in phase change materials. We present the physics behind this large resistivity contrast and describe how it is being exploited to create high density PCM. We address the challenges facing this technology, including the design of PCM cells, fabrication, device variability, thermal cross-talk and write disturb. We discuss the scalability, assess the performance, and examine the reliability of PCM including data retention, multi-bit storage and endurance.}
}

% Journal article (title carries a Best Paper marker), IEEE TVLSI volume 18, number 12 (2010).
@article{li2010tvlsi,
  author   = {Jing Li and Patrick Ndai and Ashish Goel and Sayeef Salahuddin and Kaushik Roy},
  title    = {Design Paradigm for Robust Spin-Torque Transfer Magnetic {RAM} ({STT} {MRAM}) From Circuit/Architecture Perspective (Best Paper)},
  journal  = {IEEE Transactions on Very Large Scale Integration (VLSI) Systems},
  year     = {2010},
  month    = dec,
  volume   = {18},
  number   = {12},
  pages    = {1710--1723},
  doi      = {10.1109/TVLSI.2009.2027907},
  issn     = {1063-8210},
  keywords = {journal, integrated circuit design,magnetic storage,random-access storage,high memory yield,parametric failures,process variations,robust spin-torque transfer magnetic RAM,Circuit stability,Costs,Failure analysis,Flash memory,Magnetic circuits,Performance analysis,Random access memory,Read-write memory,Robustness,Scalability,Spin-torque transfer (STT),magnetic ram (MRAM),memory yield,parametric failures},
}

% Journal article, IEEE TVLSI volume 18, number 11 (2010).
@article{chen2010tvlsi,
  author   = {Yiran Chen and Hai Li and Cheng-Kok Koh and Guangyu Sun and Jing Li and Yuan Xie and Kaushik Roy},
  title    = {Variable-Latency Adder ({VL-Adder}) Designs for Low Power and {NBTI} Tolerance},
  journal  = {IEEE Transactions on Very Large Scale Integration (VLSI) Systems},
  year     = {2010},
  month    = nov,
  volume   = {18},
  number   = {11},
  pages    = {1621--1624},
  doi      = {10.1109/TVLSI.2009.2026280},
  issn     = {1063-8210},
  keywords = {journal, adders,digital arithmetic,integrated circuit design,logic design,IC design,NBTI tolerance,circuit delay,digital arithmetic,logic design,negative bias temperature instability,variable-latency adder designs,word length 64 bit,Adders,Circuits,Clocks,Delay,Negative bias temperature instability,Niobium compounds,Sun,Throughput,Titanium compounds,Very large scale integration,Digital arithmetic,IC design,logic design},
}

% Journal article, IEEE TCAD volume 28, number 1 (2009).
% A single keywords field carries both the journal category tag and the subject keywords.
@article{li2009tcad,
  author   = {Jing Li and Kunhyuk Kang and Kaushik Roy},
  title    = {Variation Estimation and Compensation Technique in Scaled {LTPS} {TFT} Circuits for Low-Power Low-Cost Applications},
  journal  = {IEEE Transactions on Computer-Aided Design of Integrated Circuits and Systems},
  year     = {2009},
  month    = jan,
  volume   = {28},
  number   = {1},
  pages    = {46--59},
  doi      = {10.1109/TCAD.2008.2009149},
  issn     = {0278-0070},
  keywords = {journal, CMOS integrated circuits,circuit reliability,elemental semiconductors,low-power electronics,silicon,statistical analysis,thin film transistors,CMOS technology,Si,circuit reliability,compensation technique,delay variation,four-finger structure,inverter chain,low-power low-cost application,low-temperature polycrystalline-silicon thin-film transistor,multifinger design technique,multimodal delay distribution,response surface method,statistical simulation methodology,unimodal distribution,variation estimation,CMOS logic circuits,CMOS technology,Circuit simulation,Delay,Grain boundaries,Logic devices,Response surface methodology,Robustness,Substrates,Thin film transistors,Grain boundary (GB),low-temperature polycrystalline-silicon (LTPS),process variation,thin-film transistor (TFT)}
}

% Invited journal article, ACM JETC volume 4, number 3 (2008), article 13.
% A single keywords field carries both the journal category tag and the subject keywords.
@article{li2008jetc,
  author     = {Li, Jing and Bansal, Aditya and Ghosh, Swarop and Roy, Kaushik},
  title      = {An Alternate Design Paradigm for Low-power, Low-cost, Testable Hybrid Systems Using Scaled {LTPS} {TFTs} (INVITED)},
  journal    = {ACM Journal on Emerging Technologies in Computing Systems},
  issue_date = {August 2008},
  volume     = {4},
  number     = {3},
  month      = aug,
  year       = {2008},
  issn       = {1550-4832},
  pages      = {13:1--13:19},
  articleno  = {13},
  numpages   = {19},
  url        = {http://doi.acm.org/10.1145/1389089.1389093},
  doi        = {10.1145/1389089.1389093},
  acmid      = {1389093},
  publisher  = {ACM},
  address    = {New York, NY, USA},
  keywords   = {journal, 3D integration, BIST, DFT, Low-temperature polycrystalline silicon (LTPS), generic, grain boundary (GB), hybrid system, inherent variation, reconfigurable, thin-film transistor (TFT)}
}

% Journal article, IEEE TED volume 54, number 11 (2007).
@article{li2007ted,
  author   = {Jing Li and Aditya Bansal and Kaushik Roy},
  title    = {{Poly-Si} Thin-Film Transistors: An Efficient and Low-Cost Option for Digital Operation},
  journal  = {IEEE Transactions on Electron Devices},
  year     = {2007},
  month    = nov,
  volume   = {54},
  number   = {11},
  pages    = {2918--2929},
  doi      = {10.1109/TED.2007.906940},
  issn     = {0018-9383},
  keywords = {journal, elemental semiconductors,low-power electronics,silicon,silicon-on-insulator,thin film transistors,LTPS TFT,SOI,Si - Interface,driving current,low-temperature polycrystalline-silicon thin-film transistors,midgap trap density,poly-Si thin-film transistors,silicon-on-insulator,single-crystalline silicon,submicrometer ultralow-power digital operation,ultralow-power subthreshold operation,Costs,Design methodology,Design optimization,Energy consumption,Fabrication,Glass,Polymers,Silicon,Substrates,Thin film transistors,Grain boundary (GB),low-pressure chemical vapor deposition (LPCVD),low-temperature polycrystalline silicon (LTPS),thin-film transistor (TFT)},
}

%%%%%%%%%% Refereed conference %%%%%%%%%%
% Conference paper, ASPLOS '20 (marked upcoming).
% NOTE(review): the ordinal follows from the ASPLOS '18 entry in this file being the 23rd edition; confirm against the proceedings front matter.
@inproceedings{zha2020asplos,
  author    = {Zha, Yue and Li, Jing},
  title     = {{ViTAL: Virtualizing FPGAs in the Cloud}},
  booktitle = {the 25th ACM International Conference on Architectural Support for Programming Languages and Operating Systems},
  series    = {**ASPLOS** '20},
  year      = {2020},
  pubstate  = {upcoming},
  keywords  = {conference}
}


% Conference paper, 2019 IEEE Symposium on VLSI Technology.
% The chemical formula and the V_th symbol are brace-protected so sentence-casing styles keep their capitals.
% The greater-than sign is set in math mode so it renders under any font encoding.
@inproceedings{luo2019vlsit,
  author    = {Luo, Qing and Yu, Jie and Zhang, Xumeng and Xue, Kan-Hao and Cheng, Yan and Gong, Tiancheng and Lv, Hangbing and Xu, Xiaoxin and Yuan, Peng and Yin, Jiahao and Tai, Lu and Long, Shibing and Liu, Qi and Li, Jing and Liu, Ming},
  title     = {{Nb\textsubscript{1-x}O\textsubscript{2}} based Universal Selector with Ultra-high Endurance ({$>$}10\textsuperscript{12}), high speed (10ns) and Excellent {V\textsubscript{th}} Stability},
  booktitle = {2019 IEEE Symposium on VLSI Technology},
  year      = {2019},
  month     = jun,
  date      = {2019-06-09},
  doi       = {10.23919/VLSIT.2019.8776546},
  keywords  = {conference}
}

% Conference paper, 2019 IEEE Symposium on VLSI Circuits.
@inproceedings{zha2019vlsic,
  author    = {Zha, Yue and Nowak, Etienne and Li, Jing},
  title     = {{Liquid Silicon}: A Nonvolatile Fully Programmable Processing-In-Memory Processor with Monolithically Integrated {ReRAM} for {Big Data/Machine Learning} Applications},
  booktitle = {2019 IEEE Symposium on VLSI Circuits},
  year      = {2019},
  month     = jun,
  date      = {2019-06-09},
  doi       = {10.23919/VLSIC.2019.8778064},
  keywords  = {conference},
  abstract  = {A nonvolatile fully programmable processing-in-memory (PIM) processor named Liquid Silicon (L-Si) is demonstrated, which combines the superior programmability of general-purpose computing devices (e.g. FPGA) and the high power efficiency of domain-specific accelerators. Besides the general computing applications, L-Si is particularly well suited for AI/machine learning and big data applications, which not only pose high computational/memory demand but also evolves rapidly. L-Si is fabricated by monolithically integrating HfO2 resistive RAM on top of commercial 130nm Si CMOS. Our measurement confirmed the fabricated chip operates reliably at low voltage of 650 mV. It achieves 60.9 TOPS/W in performing neural network inferences and 480 GOPS/W in performing content-based similarity search (a key big data application) at nominal voltage supply of 1.2V, showing >3× and ~100× power efficiency improvement over the state-of-the-art domain-specific CMOS-/RRAM-based accelerators. In addition, it outperforms the latest nonvolatile FPGA in energy efficiency by ~3× in general compute-intensive applications.}
}

% Conference paper (title carries a Best Paper Nominee marker), FCCM 2019.
% The doi field holds the bare DOI, which starts with the 10. directory prefix.
@inproceedings{zhang2019fccm,
  author    = {Zhang, Jialiang and Liu, Yang and Jain, Gaurav and Zha, Yue and Ta, Jonathan and Li, Jing},
  title     = {{MEG}: {A RISCV}-based system simulation infrastructure for exploring memory optimization using {FPGAs} and {Hybrid Memory Cube} (Best Paper Nominee)},
  booktitle = {2019 IEEE 27th Annual International Symposium on Field-Programmable Custom Computing Machines (**FCCM**)},
  year      = {2019},
  month     = apr,
  date      = {2019-04-28},
  doi       = {10.1109/FCCM.2019.00029},
  keywords  = {conference},
  abstract  = {Emerging 3D memory technologies, such as the Hybrid Memory Cube (HMC) and High Bandwidth Memory (HBM), provide increased bandwidth and massive memory-level parallelism. Efficiently integrating emerging memories into existing system pose new challenges and require detailed evaluation in a real computing environment. In this paper, we propose MEG, an open-source, configurable, cycle-exact, and RISC-V based full system simulation infrastructure using FPGA and HMC. MEG has three highly configurable design components: (i) a HMC adaptation module that not only enables communication between the HMC device and the processor cores but also can be extended to fit other memories (e.g., HBM, nonvolatile memory) with minimal effort, (ii) a reconfigurable memory controller along with its OS support that can be effectively leveraged by system designers to perform software-hardware co-optimization, and (iii) a performance monitor module that effectively improves the observability and debuggability of the system to guide performance optimization. We provide a prototype implementation of MEG on Xilinx VCU110 board and demonstrate its capability, fidelity, and flexibility on real-world benchmark applications. We hope that our open-source release of MEG fills a gap in the space of publicly-available FPGA-based full system simulation infrastructures specifically targeting memory system and inspires further collaborative software/hardware innovations.}
}

% Conference poster, FPGA '19.
@inproceedings{zhang2019fpga,
  author    = {Zhang, Jialiang and Li, Jing},
  title     = {{Unleashing the Power of Soft Logic for Convolutional Neural Network Acceleration via Product Quantization} (Poster)},
  booktitle = {the 2019 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},
  series    = {**FPGA** '19},
  year      = {2019},
  month     = feb,
  date      = {2019-02-24},
  doi       = {10.1145/3289602.3293951},
  keywords  = {conference}
}

% Conference poster, FCCM 2018.
@inproceedings{zhang2018fccm,
  author    = {Zhang, Jialiang and Li, Jing},
  title     = {{PQ-CNN}: {Accelerating} Product Quantized Convolutional Neural Network (Poster)},
  booktitle = {2018 IEEE 26th Annual International Symposium on Field-Programmable Custom Computing Machines (**FCCM**)},
  year      = {2018},
  month     = apr,
  date      = {2018-04-29},
  doi       = {10.1109/FCCM.2018.00041},
  keywords  = {conference}
}

% Conference paper, CVPR 2018; the note records the acceptance rate.
% The doi field holds the bare DOI with no label or resolver prefix.
@inproceedings{zhang2018cvpr,
  author    = {Zhang, Jialiang and Khoram, Soroosh and Li, Jing},
  title     = {Efficient Large-scale Approximate Nearest Neighbor Search on the {OpenCL-FPGA}},
  booktitle = {Conference on Computer Vision and Pattern Recognition (**CVPR**)},
  year      = {2018},
  month     = jun,
  date      = {2018-06},
  pages     = {4924--4932},
  doi       = {10.1109/CVPR.2018.00517},
  note      = {(Acceptance Rate: \underline{29\%}, 979 out of over 3300)},
  keywords  = {conference},
  abstract  = {We present a new method for Product Quantization (PQ) based approximated nearest neighbor search (ANN) in high dimensional spaces. Specifically, we first propose a quantization scheme for the codebook of coarse quantizer, product quantizer, and rotation matrix, to reduce the cost of accessing these codebooks. Our approach also combines a highly parallel k-selection method, which can be fused with the distance calculation to reduce the memory overhead. We implement the proposed method on Intel HARPv2 platform using OpenCL-FPGA. The proposed method significantly outperforms state-of-the-art methods on CPU and GPU for high dimensional nearest neighbor queries on billion-scale datasets in terms of query time and accuracy regardless of the batch size. To our best knowledge, this is the first work to demonstrate FPGA performance superior to CPU and GPU on high-dimensional, large-scale ANN datasets.}
}

% Conference paper, ICLR 2018; the note records the acceptance rate.
@inproceedings{khoram2018iclr,
  author    = {Khoram, Soroosh and Li, Jing},
  title     = {Adaptive Quantization of Neural Networks},
  booktitle = {International Conference on Learning Representations (**ICLR**)},
  year      = {2018},
  month     = apr,
  date      = {2018-04},
  url       = {https://openreview.net/forum?id=SyOK1Sg0W},
  note      = {(Acceptance Rate: \underline{34\%}, 314 out of 935)},
  keywords  = {conference},
  abstract  = {Despite the state-of-the-art accuracy of Deep Neural Networks (DNN) in various classification problems, their deployment onto resource constrained edge computing devices remains challenging due to their large size and complexity. Several recent studies have reported remarkable results in reducing this complexity through quantization of DNN models. However, these studies usually do not consider the changes in the loss function when performing quantization, nor do they take the different importances of DNN model parameters to the accuracy into account. We address these issues in this paper by proposing a new method, called adaptive quantization, which simplifies a trained DNN model by finding a unique, optimal precision for each network parameter such that the increase in loss is minimized. The optimization problem at the core of this method iteratively uses the loss function gradient to determine an error margin for each parameter and assigns it a precision accordingly. Since this problem uses linear functions, it is computationally cheap and, as we will show, has a closed-form approximate solution. Experiments on MNIST, CIFAR, and SVHN datasets showed that the proposed method can achieve near or better than state-of-the-art reduction in model size with similar error rates. Furthermore, it can achieve compressions close to floating-point model compression methods without loss of accuracy.}
}

% Invited conference paper, CSTIC 2018. Only fields with known values are recorded.
@inproceedings{li2018CSTIC,
  author    = {Li, Jing},
  title     = {Nonvolatile Memory Outlook: Technology Driven or Application Driven? (INVITED)},
  booktitle = {2018 China Semiconductor Technology International Conference (CSTIC)},
  year      = {2018},
  month     = mar,
  date      = {2018-03-12},
  pages     = {1--4},
  keywords  = {conference}
}

% ASPLOS '18: Liquid Silicon-Monona reconfigurable memory-oriented fabric (Zha, Li).
% Only one keywords field is kept: a repeated field is ignored by BibTeX, so the
% earlier empty one used to hide the "conference" tag.
@inproceedings{zha2018asplos,
  author    = {Zha, Yue and Li, Jing},
  title     = {{Liquid Silicon-Monona}: A Reconfigurable Memory-Oriented Computing Fabric with Scalable Multi-Context Support},
  booktitle = {23rd International Conference on Architectural Support for Programming Languages and Operating Systems},
  series    = {**ASPLOS** '18},
  year      = {2018},
  month     = {Mar},
  date      = {2018-03-19},
  location  = {Williamsburg, VA, USA},
  pages     = {214--228},
  volume    = {53},
  issue     = {2},
  url       = {http://doi.acm.org/10.1145/3173162.3173167},
  doi       = {10.1145/3173162.3173167},
  publisher = {ACM},
  address   = {New York, NY, USA},
  abstract  = {With the recent trend of promoting Field-Programmable Gate Arrays (FPGAs) to first-class citizens in accelerating compute-intensive applications in networking, cloud services and artificial intelligence, FPGAs face two major challenges in sustaining competitive advantages in performance and energy efficiency for diverse cloud workloads: 1) limited configuration capability for supporting light-weight computations/on-chip data storage to accelerate emerging search-/data-intensive applications. 2) lack of architectural support to hide reconfiguration overhead for assisting virtualization in a cloud computing environment. In this paper, we propose a reconfigurable memory-oriented computing fabric, namely Liquid Silicon-Monona (L-Si), enabled by emerging nonvolatile memory technology i.e. RRAM, to address these two challenges. Specifically, L-Si addresses the first challenge by virtue of a new architecture comprising a 2D array of physically identical but functionally-configurable building blocks. It, for the first time, extends the configuration capabilities of existing FPGAs from computation to the whole spectrum ranging from computation to data storage. It allows users to better customize hardware by flexibly partitioning hardware resources between computation and memory, greatly benefiting emerging search- and data-intensive applications. To address the second challenge, L-Si provides scalable multi-context architectural support to minimize reconfiguration overhead for assisting virtualization. In addition, we provide compiler support to facilitate the programming of applications written in high-level programming languages (e.g. OpenCL) and frameworks (e.g. TensorFlow, MapReduce) while fully exploiting the unique architectural capability of L-Si. Our evaluation results show L-Si achieves 99.6\% area reduction, 1.43× throughput improvement and 94.0\% power reduction on search-intensive benchmarks, as compared with the FPGA baseline. For neural network benchmarks, on average, L-Si achieves 52.3× speedup, 113.9× energy reduction and 81\% area reduction over the FPGA baseline. In addition, the multi-context architecture of L-Si reduces the context switching time to ∼10ns, compared with an off-the-shelf FPGA (∼100ms), greatly facilitating virtualization.},
  keywords  = {conference},
  note      = {(Acceptance Rate: \underline{18.2\%}, 56 out of 307)}
}

% FPGA '18: degree-aware hybrid graph traversal on an FPGA-HMC platform (Zhang, Li).
@inproceedings{zhang2018fpga,
  author    = {Zhang, Jialiang and Li, Jing},
  title     = {Degree-aware Hybrid Graph Traversal on {FPGA-HMC} Platform},
  booktitle = {Proceedings of the 2018 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},
  series    = {**FPGA** '18},
  year      = {2018},
  month     = {Feb},
  date      = {2018-02-25},
  pages     = {229--238},
  location  = {Monterey, California, USA},
  publisher = {ACM},
  address   = {New York, NY, USA},
  url       = {http://doi.acm.org/10.1145/3174243.3174245},
  doi       = {10.1145/3174243.3174245},
  keywords  = {conference, graph processor, hybrid memory cube, bfs},
  abstract  = {Graph traversal is a core primitive for graph analytics and a basis for many higher-level graph analysis methods. However, irregularities in the structure of scale-free graphs (e.g., social network) limit our ability to analyze these important and growing datasets. A key challenge is the redundant graph computations caused by the presence of high-degree vertices which not only increase the total amount of computations but also incur unnecessary random data access. In this paper, we present a graph processing system on an FPGA-HMC platform, based on software/hardware co-design and co-optimization. For the first time, we leverage the inherent graph property i.e. vertex degree to co-optimize algorithm and hardware architecture. In particular, we first develop two algorithm optimization techniques: degree-aware adjacency list reordering and degree-aware vertex index sorting. The former can reduce the number of redundant graph computations, while the latter can create a strong correlation between vertex index and data access frequency, which can be effectively applied to guide the hardware design. We further implement the optimized hybrid graph traversal algorithm on an FPGA-HMC platform. By leveraging the strong correlation between vertex index and data access frequency made by degree-aware vertex index sorting, we develop two platform-dependent hardware optimization techniques, namely degree-aware data placement and degree-aware adjacency list compression. These two techniques together substantially reduce the amount of access to external memory. Finally, we conduct extensive experiments on an FPGA-HMC platform to verify the effectiveness of the proposed techniques. To the best of our knowledge, our implementation achieves the highest performance (45.8 billion traversed edges per second) among existing FPGA-based graph processing systems.},
  note      = {(Acceptance Rate*: \underline{24\%})}
}

% FPGA '18: Liquid Silicon, an RRAM-enabled data-centric reconfigurable architecture (Zha, Li).
@inproceedings{zha2018fpga,
  author    = {Zha, Yue and Li, Jing},
  title     = {{Liquid Silicon}: A Data-Centric Reconfigurable Architecture enabled by {RRAM} Technology},
  booktitle = {Proceedings of the 2018 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},
  series    = {**FPGA** '18},
  year      = {2018},
  month     = {Feb},
  date      = {2018-02-25},
  pages     = {51--60},
  url       = {http://doi.acm.org/10.1145/3174243.3174244},
  doi       = {10.1145/3174243.3174244},
  location  = {Monterey, California, USA},
  publisher = {ACM},
  address   = {New York, NY, USA},
  keywords  = {conference, monolithic stacking, non-volatile memory, processing-in-memory, reconfigurable architecture, tcam},
  abstract  = {This paper presents a data-centric reconfigurable architecture, namely Liquid Silicon, enabled by emerging non-volatile memory, i.e., RRAM. Compared to the heterogeneous architecture of commercial FPGAs, Liquid Silicon is inherently a homogeneous architecture comprising a two-dimensional (2D) array of identical 'tiles'. Each tile can be configured into one or a combination of four modes: TCAM, logic, interconnect, and memory. Such flexibility allows users to partition resources based on applications' needs, in contrast to the fixed hardware design using dedicated hard IP blocks in FPGAs. In addition to better resource usage, its 'memory friendly' architecture effectively addresses the limitations of commercial FPGAs i.e., scarce on-chip memory resources, making it an effective complement to FPGAs. Moreover, its coarse-grained logic implementation results in shallower logic depth, less inter-tile routing overhead, and thus smaller area and better performance, compared with its FPGA counterpart. Our study shows that, on average, for both traditional and emerging applications, we achieve 62\% area reduction, 27\% speedup and 31\% improvement in energy efficiency when mapping applications onto Liquid Silicon instead of FPGAs.},
  note      = {(Acceptance Rate*: \underline{24\%}, Ranked **\#1** among 100+ submissions)}
}


% FPGA '18: graph analytics via co-optimized storage and access on FPGA-HMC (Khoram, Zhang, Strange, Li).
@inproceedings{khoram2018fpga,
  author    = {Khoram, Soroosh and Zhang, Jialiang and Strange, Maxwell and Li, Jing},
  title     = {Accelerating Graph Analytics By Co-Optimizing Storage and Access on an {FPGA-HMC} Platform},
  booktitle = {Proceedings of the 2018 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},
  series    = {**FPGA** '18},
  year      = {2018},
  month     = {Feb},
  date      = {2018-02-25},
  pages     = {239--248},
  url       = {http://doi.acm.org/10.1145/3174243.3174260},
  doi       = {10.1145/3174243.3174260},
  location  = {Monterey, California, USA},
  publisher = {ACM},
  address   = {New York, NY, USA},
  abstract  = {Graph analytics, which explores the relationships among interconnected entities, is becoming increasingly important due to its broad applicability, from machine learning to social sciences. However, due to the irregular data access patterns in graph computations, one major challenge for graph processing systems is performance. The algorithms, softwares, and hardwares that have been tailored for mainstream parallel applications are generally not effective for massive, sparse graphs from the real-world problems, due to their complex and irregular structures. To address the performance issues in large-scale graph analytics, we leverage the exceptional random access performance of the emerging Hybrid Memory Cube (HMC) combined with the flexibility and efficiency of modern FPGAs. In particular, we develop a collaborative software/hardware technique to perform a level-synchronized Breadth First Search (BFS) on a FPGA-HMC platform. From the software perspective, we develop an architecture-aware graph clustering algorithm that exploits the FPGA-HMC platform's capability to improve data locality and memory access efficiency. From the hardware perspective, we further improve the FPGA-HMC graph processor architecture by designing a memory request merging unit to take advantage of the increased data locality resulting from graph clustering. We evaluate the performance of our BFS implementation using the AC-510 development kit from Micron and achieve $2.8 \times$ average performance improvement compared to the latest FPGA-HMC based graph processing system over a set of benchmarks from a wide range of applications.},
  keywords  = {conference, graph analytics, graph clustering, hardware accelerators, hybrid memory cube, reconfigurable logic, bfs},
  note      = {(Acceptance Rate*: \underline{24\%})}
}

% ICCAD '17: RRAM-based reconfigurable in-memory computing with hybrid routing (Zha, Li).
@inproceedings{zha2017iccad,
  author    = {Yue Zha and Jing Li},
  title     = {{RRAM-based} reconfigurable in-memory computing architecture with hybrid routing},
  booktitle = {2017 IEEE/ACM International Conference on Computer-Aided Design (ICCAD)},
  series    = {**ICCAD** '17},
  year      = {2017},
  month     = {Nov},
  date      = {2017-11-13},
  pages     = {527--532},
  keywords  = {conference, Architecture,Delays,Field programmable gate arrays,Logic functions,Routing,Switches,Tiles,CAD Framework,Hybrid Routing,In-Memory Computing,Reconfigurable Architecture,liquid Silicon},
  abstract  = {Recent advances in resistive random-access memory (RRAM) evoke great interests in exploring alternative architectures. One interesting work is a RRAM-based reconfigurable architecture that provides superior programmability and blurs the boundary between computation and storage, but long-distance routing becomes a performance bottleneck. However, long-distance routing in FPGA is efficiently implemented, but its fine-grained routing structure results in a large routing overhead. In this work, we present a RRAM-based reconfigurable architecture that addresses the routing challenges using hybrid routing, i.e., local and global routing by taking the best advantages of both architectures (prior RRAM-based and FPGA). We also provide a complete CAD framework that exhibits high parallelism and good scalability. Experimental results show that our reconfigurable architecture outperforms both architectures. It achieves a 46.88\% reduction in delay and improves the energy efficiency by 66.23\% compared with the prior RRAM-based architecture with a slightly increased area overhead. While comparing with FPGA, it reduces the delay and the routing overhead by 36.00\% and 50.20\%, respectively. Additionally, our CAD framework achieves 5.39x speedup, compared with the prior framework.},
  doi       = {10.1109/ICCAD.2017.8203822},
  ISSN      = {1558-2434},
  note      = {(Acceptance Rate: \underline{26\%}, 105 out of 399)},
}

% FCCM 2017 poster: large-scale graph analytics with FPGA and HMC (Khoram, Zhang, Strange, Li).
% The page range belongs in pages only; it is no longer repeated in number.
@inproceedings{khoram2017fccm,
  author    = {Soroosh Khoram and Jialiang Zhang and Maxwell Strange and Jing Li},
  title     = {Accelerating Large-Scale Graph Analytics with {FPGA} and {HMC} (Poster)},
  booktitle = {2017 IEEE 25th Annual International Symposium on Field-Programmable Custom Computing Machines (**FCCM**)},
  year      = {2017},
  month     = {April},
  date      = {2017-04-30},
  pages     = {82--82},
  keywords  = {conference, field programmable gate arrays,graph theory,information retrieval,learning (artificial intelligence),social sciences,tree searching,BFS,FPGA-HMC based graph processing system,breadth first search,hybrid memory cube,interconnected entities,irregular data access pattern,large-scale graph analytics,machine learning,massive-scale sparse graphs,social science,Acceleration,Clustering algorithms,Field programmable gate arrays,Hardware,Merging,Software,Software algorithms,Breadth-First Search,Graph Clustering,Hybrid memory Cube},
  doi       = {10.1109/FCCM.2017.58},
  note      = {(Acceptance Rate: \underline{25\%}, 32 out of 128)},
}


% ISPD '17 invited paper: near-memory vs. in-memory processing taxonomy (Khoram, Zha, Zhang, Li).
@inproceedings{khoram2017ISPD,
  title     = {Challenges and Opportunities: From Near-memory Computing to In-memory Computing (INVITED)},
  author    = {Khoram, Soroosh and Zha, Yue and Zhang, Jialiang and Li, Jing},
  booktitle = {Proceedings of the 2017 ACM on International Symposium on Physical Design},
  series    = {**ISPD** '17},
  date      = {2017-03-19},
  year      = {2017},
  month     = {Mar},
  location  = {Portland, Oregon, USA},
  publisher = {ACM},
  address   = {New York, NY, USA},
  pages     = {43--46},
  numpages  = {4},
  isbn      = {978-1-4503-4696-2},
  doi       = {10.1145/3036669.3038242},
  url       = {http://doi.acm.org/10.1145/3036669.3038242},
  acmid     = {3038242},
  keywords  = {conference, 3d integration, in-memory processing, near-memory processing, nonvolatile memory},
  abstract  = {The confluence of the recent advances in technology and the ever-growing demand for large-scale data analytics created a renewed interest in a decades-old concept, processing-in-memory (PIM). PIM, in general, may cover a very wide spectrum of compute capabilities embedded in close proximity to or even inside the memory array. In this paper, we present an initial taxonomy for dividing PIM into two broad categories: 1) Near-memory processing and 2) In-memory processing. This paper highlights some interesting work in each category and provides insights into the challenges and possible future directions.},
  note      = {(Acceptance Rate*: \underline{35\%})}
}


% CSTIC 2017 invited paper: RRAM technology from compact models to applications (Zha, Wei, Li).
@inproceedings{zha2017CSTIC,
  author    = {Yue Zha and Zhiqiang Wei and Jing Li},
  title     = {Recent progress in {RRAM} technology: From compact models to applications (INVITED)},
  booktitle = {2017 China Semiconductor Technology International Conference (CSTIC)},
  year      = {2017},
  month     = {March},
  date      = {2017-03-12},
  pages     = {1--4},
  keywords  = {conference, integrated circuit modelling,product development,resistive RAM,IV characteristics,RRAM technology,SCM,commercialization progress,compact model,drop-in replacement,embedded memory,essential electrical-chemical-thermal properties,nonVon Neumann architecture,product development,standalone memory,storage class memory,switching dynamics,Computational modeling,Computer architecture,Hidden Markov models,Mathematical model,Random access memory,Resistance,Switches},
  doi       = {10.1109/CSTIC.2017.7919731},
}

% DAC-WIP '17 poster: RRAM-based reconfigurable in-memory computing with hybrid routing (Zha, Li).
@inproceedings{zha2017dacwip,
  title     = {{RRAM}-based  Reconfigurable  In-Memory  Computing  Architecture with Hybrid Routing (poster)},
  author    = {Zha, Yue and Li, Jing},
  booktitle = {the 54th Annual Design Automation Conference Work-in-Progress},
  series    = {DAC-WIP '17},
  date      = {2017-06},
  year      = {2017},
  month     = {Jun},
  location  = {Austin, TX, USA},
  address   = {New York, NY, USA},
  isbn      = {978-1-4503-4927-7},
  keywords  = {conference},
  note      = {(Acceptance Rate*: \underline{29\%})},
}

% FPGA '17: OpenCL-based FPGA accelerator for convolutional neural networks (Zhang, Li).
@inproceedings{zhang2017fpgaCNN,
  title     = {Improving the Performance of {OpenCL-based FPGA} Accelerator for Convolutional Neural Network},
  author    = {Zhang, Jialiang and Li, Jing},
  booktitle = {Proceedings of the 2017 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},
  series    = {**FPGA** '17},
  date      = {2017-02-22},
  year      = {2017},
  location  = {Monterey, California, USA},
  publisher = {ACM},
  address   = {New York, NY, USA},
  pages     = {25--34},
  numpages  = {10},
  isbn      = {978-1-4503-4354-1},
  doi       = {10.1145/3020078.3021698},
  url       = {http://doi.acm.org/10.1145/3020078.3021698},
  acmid     = {3021698},
  keywords  = {conference, convolutional neural networks, fpga, hardware accelerator, opencl},
  abstract  = {OpenCL FPGA has recently gained great popularity with emerging needs for workload acceleration such as Convolutional Neural Network (CNN), which is the most popular deep learning architecture in the domain of computer vision. While OpenCL enhances the code portability and programmability of FPGA, it comes at the expense of performance. The key challenge is to optimize the OpenCL kernels to efficiently utilize the flexible hardware resources in FPGA. Simply optimizing the OpenCL kernel code through various compiler options turns out insufficient to achieve desirable performance for both compute-intensive and data-intensive workloads such as convolutional neural networks.

In this paper, we first propose an analytical performance model and apply it to perform an in-depth analysis on the resource requirement of CNN classifier kernels and available resources on modern FPGAs. We identify that the key performance bottleneck is the on-chip memory bandwidth. We propose a new kernel design to effectively address such bandwidth limitation and to provide an optimal balance between computation, on-chip, and off-chip memory access. As a case study, we further apply these techniques to design a CNN accelerator based on the VGG model. Finally, we evaluate the performance of our CNN accelerator using an Altera Arria 10 GX1150 board. We achieve 866 Gop/s floating point performance at 370MHz working frequency and 1.79 Top/s 16-bit fixed-point performance at 385MHz. To the best of our knowledge, our implementation achieves the best power efficiency and performance density compared to existing work.},
  note      = {(Acceptance Rate: \underline{25\%}, 25 out of 101)},
}


% FPGA '17: breadth-first search on an FPGA-HMC graph processor (Zhang, Khoram, Li).
@inproceedings{zhang2017fpgaBFS,
  author    = {Zhang, Jialiang and Khoram, Soroosh and Li, Jing},
  title     = {Boosting the Performance of {FPGA-based} Graph Processor Using {Hybrid Memory Cube}: A Case for Breadth First Search},
  booktitle = {Proceedings of the 2017 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},
  series    = {**FPGA** '17},
  year      = {2017},
  date      = {2017-02-22},
  isbn      = {978-1-4503-4354-1},
  location  = {Monterey, California, USA},
  pages     = {207--216},
  numpages  = {10},
  url       = {http://doi.acm.org/10.1145/3020078.3021737},
  doi       = {10.1145/3020078.3021737},
  acmid     = {3021737},
  publisher = {ACM},
  address   = {New York, NY, USA},
  abstract  = {Large graph processing has gained great attention in recent years due to its broad applicability from machine learning to social science. Large real-world graphs, however, are inherently difficult to process efficiently, not only due to their large memory footprint, but also that most graph algorithms entail memory access patterns with poor locality and a low compute-to-memory access ratio. In this work, we leverage the exceptional random access performance of emerging Hybrid Memory Cube (HMC) technology that stacks multiple DRAM dies on top of a logic layer, combined with the flexibility and efficiency of FPGA to address these challenges. To our best knowledge, this is the first work that implements a graph processing system on a FPGA-HMC platform based on software/hardware co-design and co-optimization. We first present the modifications of algorithm and a platform-aware graph processing architecture to perform level-synchronized breadth first search (BFS) on FPGA-HMC platform. To gain better insights into the potential bottlenecks of proposed implementation, we develop an analytical performance model to quantitatively evaluate the HMC access latency and corresponding BFS performance. Based on the analysis, we propose a two-level bitmap scheme to further reduce memory access and perform optimization on key design parameters (e.g. memory access granularity). Finally, we evaluate the performance of our BFS implementation using the AC-510 development kit from Micron. We achieved 166 million edges traversed per second (MTEPS) using GRAPH500 benchmark on a random graph with a scale of 25 and an edge factor of 16, which significantly outperforms CPU and other FPGA-based large graph processors.},
  keywords  = {conference, graph processor, hybrid memory cube, bfs},
  note      = {(Acceptance Rate: \underline{25\%}, 25 out of 101)}
}


% FPGA '17 poster: mixed-signal data-centric reconfigurable architecture enabled by RRAM (Zha, Zhang, Wei, Li).
@inproceedings{zha2017FPGAposter,
  title     = {A Mixed-Signal Data-Centric Reconfigurable Architecture Enabled by {RRAM} Technology (poster)},
  author    = {Zha, Yue and Zhang, Jialiang and Wei, Zhiqiang and Li, Jing},
  booktitle = {Proceedings of the 2017 ACM/SIGDA International Symposium on Field-Programmable Gate Arrays},
  series    = {**FPGA** '17},
  date      = {2017-02-22},
  year      = {2017},
  location  = {Monterey, California, USA},
  publisher = {ACM},
  address   = {New York, NY, USA},
  pages     = {285--285},
  numpages  = {1},
  isbn      = {978-1-4503-4354-1},
  doi       = {10.1145/3020078.3021759},
  url       = {http://doi.acm.org/10.1145/3020078.3021759},
  acmid     = {3021759},
  keywords  = {conference, coarse-grained configuration, mixed-signal processing, non-volatile memory, reconfigurable architecture, ternary content addressable memory},
  note      = {(Acceptance Rate: \underline{25\%}, 25 out of 101)},
}


% ICCAD '16: reconfigurable in-memory computing with a resistive memory crossbar (Zha, Li).
@inproceedings{zha2016ICCAD,
  author       = {Yue Zha and Jing Li},
  title        = {Reconfigurable in-memory computing with resistive memory crossbar},
  booktitle    = {Proceedings of the 35th International Conference on Computer-Aided Design},
  series       = {**ICCAD** '16},
  year         = {2016},
  date         = {2016-11-07},
  location     = {Austin, Texas},
  pages        = {120:1--120:8},
  organization = {ACM},
  isbn         = {978-1-4503-4466-1},
  doi          = {10.1145/2966986.2967069},
  url          = {http://dl.acm.org/citation.cfm?id=2967069},
  keywords     = {conference, RRAM, in-memory computing, reconfigurable},
  abstract     = {Driven by recent advances in resistive random-access memory (RRAM), there have been growing interests in exploring alternative computing concept, i.e., in-memory processing, to address the classical von Neumann bottlenecks. Despite of their great promise in improving performance and energy efficiency, most existing works are built on the inherent matrix-vector multiplication capability of RRAM crossbar structure, and thus lack the flexibility to adapt to future market/technology induced changes in data-intensive applications. To address these challenges, we propose an in-memory reconfigurable architecture based on RRAM crossbar structure. For the first time, it achieves a full programmability across computation and storage, and thereby provides more flexibilities of partitioning the hardware resources based on applications' needs. We further develop two complete CAD design flows to facilitate development of applications written in hardware description languages (HDLs) for our architecture, based on: 1) adaption from existing tool set developed for FPGA, 2) a custom tool design optimized towards the new architecture. Our experiments show that, both design flows are effective in exploiting flexible resources offered by our architecture and thus achieves better efficiency than state-of-art FPGAs (30\% improvement in performance with 66\% reduction in area). In addition, compared to adapted design flow, our custom design flow achieves speedup by 3.3×, and further improves mapping quality.},
  note         = {(Acceptance Rate: \underline{24\%}, 97 out of 408)}
}


% VLSI Technology 2016: CMOS-compatible 3D vertical RRAM with self-aligned self-selective cell (Xu et al.).
@inproceedings{xu2016vlsi,
  author    = {Xiaoxin Xu and Qing Luo and Tiancheng Gong and Hangbing Lv and Shibing Long and Qi Liu and Steve S. Chung and Jing Li and Ming Liu},
  title     = {Fully {CMOS} compatible {3D} vertical {RRAM} with self-aligned self-selective cell enabling sub-5nm scaling},
  booktitle = {2016 IEEE Symposium on VLSI Technology},
  year      = {2016},
  month     = {June},
  date      = {2016-06},
  pages     = {1--2},
  keywords  = {conference, CMOS memory circuits,integrated circuit manufacture,resistive RAM,CMOS,RRAM,self-aligned self-selective cell,size 5 nm,vertical resistive switching memory,Etching,Hafnium compounds,Leakage currents,Programming,Resistance,Three-dimensional displays,Threshold voltage},
  doi       = {10.1109/VLSIT.2016.7573388},
}

% IRPS 2016: compact RRAM model including random telegraph noise (Guan, Li).
@inproceedings{guan2016irps,
  author    = {Bochen Guan and Jing Li},
  title     = {A compact model for {RRAM} including random telegraph noise},
  booktitle = {2016 IEEE International Reliability Physics Symposium (IRPS)},
  year      = {2016},
  month     = {April},
  date      = {2016-04},
  pages     = {MY-5-1--MY-5-4},
  keywords  = {conference, Monte Carlo methods,current fluctuations,electromagnetic interference,integrated circuit design,integrated circuit reliability,random noise,resistive RAM,telegraphy,Monte Carlo method,RRAM circuit reliability,RRAM compact model,RTN effect,current fluctuation,random telegraph noise,tunneling gap,Current measurement,Data models,Electron traps,Fluctuations,Integrated circuit modeling,Mathematical model,Switches,Compact model,RRAM,Random Telegraph Noise},
  doi       = {10.1109/IRPS.2016.7574621},
}

% IEDM 2015: 3D vertical RRAM with low-leakage, self-compliance memory cells (Luo et al.).
@inproceedings{luo2015iedm,
  author    = {Qing Luo and Xiaoxin Xu and Hongtao Liu and Hangbing Lv and Tiancheng Gong and Shibing Long and Qi Liu and Haitao Sun and Writam Banerjee and Ling Li and Jianfeng Gao and Nianduan Lu and Steve S. Chung and Jing Li and Ming Liu},
  title     = {Demonstration of {3D} vertical {RRAM} with ultra low-leakage, high-selectivity and self-compliance memory cells},
  booktitle = {2015 IEEE International Electron Devices Meeting (**IEDM**)},
  year      = {2015},
  month     = {Dec},
  date      = {2015-12},
  pages     = {10.2.1--10.2.4},
  keywords  = {conference, hafnium compounds,ionic conductivity,leakage currents,mixed conductivity,resistive RAM,3D vertical RRAM,HfO2,HfO2/mixed ionic and electronic conductor bilayer,four-layer V-RRAM array,high selectivity,nonlinearity,operation current,self-compliance memory cells,self-selective cell,ultra low-leakage,ultra-low half-select leakage,Hafnium compounds,Leakage currents,Optical switches,Resistance,Three-dimensional displays,Tin},
  doi       = {10.1109/IEDM.2015.7409667},
  note      = {(Acceptance Rate*: \underline{33\%})},
}


% ISCAS 2015 invited paper: phase-change memory for data-centric computing (Li).
@inproceedings{li2015iscas,
  author    = {Jing Li},
  title     = {Enabling phase-change memory for data-centric computing: Technology, circuit and system (INVITED)},
  booktitle = {2015 IEEE International Symposium on Circuits and Systems (ISCAS)},
  year      = {2015},
  month     = {May},
  date      = {2015-05},
  pages     = {21--24},
  keywords  = {conference, Big Data,computer centres,content-addressable storage,memory architecture,phase change memories,Big Data problems,NVM technology,PCM technology,TCAM,computing stack,cost-per-bit factor,data manipulation,data storage,data-centric computing,data-intensive applications,endurance factor,hardware features,nonvolatile memory technology,performance factor,phase-change memory,power factor,retention factor,ternary content addressable memory,Encoding,Hardware,Nonvolatile memory,Phase change materials,Phase change memory,Random access memory,Reliability,Emerging Nonvolatile Memory,PCM,TCAM,Ternary Content Addressable Memory,data-centric system,near-/in-memory computing,phase change memory},
  doi       = {10.1109/ISCAS.2015.7168560},
  ISSN      = {0271-4302},
}


% VLSI Circuits 2013: 1Mb 2T-2R nonvolatile TCAM with two-bit encoding (Li et al.).
% The cell area in the title is written in math mode; a bare caret outside math
% mode is a LaTeX error.
@inproceedings{li2013vlsi,
  author    = {Li, Jing and Montoye, Robert and Ishii, Masatoshi and Stawiasz, Kevin and Nishida, Takeshi and Maloney, Kim and Ditlow, Gary and Lewis, Scott and Maffitt, Tom and Jordan, Richard and others},
  title     = {{1Mb} 0.41 {$\mu$m$^2$} {2T-2R} cell nonvolatile {TCAM} with two-bit encoding and clocked self-referenced sensing (Highlight Paper of the Year)},
  booktitle = {2013 Symposium on VLSI Circuits},
  year      = {2013},
  month     = {June},
  date      = {2013-06-12},
  pages     = {C104--C105},
  keywords  = {conference, CMOS memory circuits,SRAM chips,clocks,content-addressable storage,integrated circuit design,integrated circuit reliability,low-power electronics,phase change memories,search problems,2-transistor-2-resistive-storage cells,2T-2R cells,CSRSS,IBM CMOS technology,PCM process,SRAM-based TCAM,bit rate 1 Mbit/s,cell nonvolatile TCAM,cell size,clocked self-referenced sensing scheme,compact cells,fabricated nonvolatile TCAM,low voltage search operation,match delay,mushroom phase-change memory process,reliable search operation,size 90 nm,technology node,test chip design,two-bit encoding,Arrays,Clocks,Encoding,Microprocessors,Phase change materials,Sensors},
  ISSN      = {2158-5601},
  abstract  = {This work demonstrates the first fabricated nonvolatile TCAM using 2-transistor/2-resistive-storage (2T-2R) cells to achieve >10× smaller cell size than SRAM-based TCAMs at the same technology node. The test chip was designed and fabricated in IBM 90nm CMOS technology and mushroom phase-change memory (PCM) process. To ensure reliable search operation with such compact cells, two enabling techniques were developed and implemented in hardware: 1) two-bit encoding, and 2) a clocked self-referenced sensing scheme (CSRSS). The 1Mb chip demonstrates reliable low voltage search operation (VDDmin~750mV) and a match delay of 1.9 ns under nominal operating conditions.},
  note      = {(Acceptance Rate: \underline{27\%}, 109 out of 396)}
}


% ICCD 2012: case for small row buffers in non-volatile main memories (Meza, Li, Mutlu).
@inproceedings{meza2012iccd,
  author    = {Justin Meza and Jing Li and Onur Mutlu},
  title     = {A case for small row buffers in non-volatile main memories},
  booktitle = {2012 IEEE 30th International Conference on Computer Design (**ICCD**)},
  year      = {2012},
  month     = {Sept},
  date      = {2012-09-30},
  pages     = {484--485},
  keywords  = {conference, DRAM chips,buffer circuits,multiprocessing systems,DRAM baseline,DRAM chips,DRAM-based main memories,NVM technologies,array access,buffered data,chip costs,data mapping schemes,main memory dynamic energy,memory array access,memory parallelism,multicore architectures,nonvolatile main memories,read operations,row buffer size,small row buffers,system-level trends,Arrays,Memory management,Nonvolatile memory,Organizations,Phase change materials,Random access memory},
  abstract  = {DRAM-based main memories have read operations that destroy the read data, and as a result, must buffer large amounts of data on each array access to keep chip costs low. Unfortunately, system-level trends such as increased memory contention in multi-core architectures and data mapping schemes that improve memory parallelism lead to only a small amount of the buffered data to be accessed. This makes buffering large amounts of data on every memory array access energy-inefficient; yet organizing DRAM chips to buffer small amounts of data is costly, as others have shown. Emerging non-volatile memories (NVMs) such as PCM, STT-RAM, and RRAM, however, do not have destructive read operations, opening up opportunities for employing small row buffers without incurring additional area penalty and/or design complexity. In this work, we discuss and evaluate architectural changes to enable small row buffers at a low cost in NVMs. We find that on a multi-core system, reducing the row buffer size can greatly reduce main memory dynamic energy compared to a DRAM baseline with large row sizes, without greatly affecting endurance, and for some NVM technologies, leads to improved performance.},
  doi       = {10.1109/ICCD.2012.6378685},
  ISSN      = {1063-6404},
  note      = {(Acceptance Rate: \underline{25\%}, 61 out of 241)},
}

% IRPS 2012 invited paper: resistance drift in phase change memory (Li, Luan, Lam).
% date is derived from the year and month fields of this entry.
@inproceedings{li2012irps,
  author    = {Jing Li and Binquan Luan and Chung Lam},
  title     = {Resistance drift in phase change memory (INVITED)},
  booktitle = {2012 IEEE International Reliability Physics Symposium (**IRPS**)},
  year      = {2012},
  month     = {April},
  date      = {2012-04},
  pages     = {6C.1.1--6C.1.6},
  keywords  = {conference, circuit reliability,molecular dynamics method,phase change memories,MLC PCM,SR,amorphous chalcogenide material,atomic structure,material engineering,mitigation technique,phase change memory,physics model,quantum molecular dynamic simulation,reliability issue,structural relaxation,time dependent resistance drift,Annealing,Kinetic theory,Phase change materials,Resistance,Strontium,Temperature measurement,drift,multi-level cell,phase change memory,structural relaxation},
  doi       = {10.1109/IRPS.2012.6241871},
  ISSN      = {1541-7026},
}

% IRPS 2012 paper on how melting during RESET affects phase change memory reliability.
@inproceedings{du2012irps,
  author    = {Pei-Ying Du and J. Y. Wu and T. H. Hsu and M. H. Lee and T. Y. Wang and H. Y. Cheng and E. K. Lai and S. C. Lai and H. L. Lung and S. Kim and M. J. BrightSky and Y. Zhu and S. Mittal and R. Cheek and S. Raoux and E. A. Joseph and A. Schrott and Jing Li and Chung Lam},
  title     = {The impact of melting during reset operation on the reliability of phase change memory},
  booktitle = {2012 IEEE International Reliability Physics Symposium (**IRPS**)},
  year      = {2012},
  month     = {April},
  pages     = {6C.2.1--6C.2.6},
  doi       = {10.1109/IRPS.2012.6241872},
  issn      = {1541-7026},
  keywords  = {conference, arrays,circuit reliability,electromigration,melting,phase change memories,segregation,GST-based phase change memory,RESET melting healing effect,SET induced damage,SET operation,control circuits,electromigration,large test chips,operation impact,phase change memory reliability,phase segregation,reset operation,Conductivity,Electromigration,Maintenance engineering,Phase change materials,Phase change memory,Resistance,Tin,Endurance,RESET operation,electromigration,melting,phasechange memory (PCM),reliability,segregation},
}

% NVMTS 2011 paper on materials engineering for phase change random access memory.
@inproceedings{raoux2011nvmts,
  author    = {Simone Raoux and Huai-Yu Cheng and Jury Sandrini and Jing Li and Jean Jordan-Sweet},
  title     = {Materials engineering for Phase Change Random Access Memory},
  booktitle = {2011 11th Annual Non-Volatile Memory Technology Symposium Proceeding (NVMTS)},
  year      = {2011},
  month     = {Nov},
  pages     = {1--5},
  doi       = {10.1109/NVMTS.2011.6137090},
  keywords  = {conference, X-ray diffraction,antimony alloys,crystallisation,germanium alloys,phase change materials,phase change memories,tellurium alloys,GeSbTe,amorphous phase,crystallization temperature,electrical contrast,materials engineering,phase change random access memory,rhombohedral phase,temperature 200 degC,time resolved X-ray diffraction,Phase Change Materials,Phase Change Random Access Memory},
}

% VLSI-TSA 2012 paper on programming-current optimization for phase change memory endurance.
@inproceedings{kim2012vlsitsa,
  author    = {S. Kim and P. Y. Du and Jing Li and M. Breitwisch and Y. Zhu and S. Mittal and R. Cheek and T. H. Hsu and M. H. Lee and A. Schrott and S. Raoux and H. Y. Cheng and S. C. Lai and J. Y. Wu and T. Y. Wang and E. A. Joseph and E. K. Lai and A. Ray and H. L. Lung and C. Lam},
  title     = {Optimization of programming current on endurance of phase change memory},
  booktitle = {Proceedings of Technical Program of 2012 VLSI Technology, System and Application (**VLSI-TSA**)},
  year      = {2012},
  month     = {April},
  pages     = {1--2},
  doi       = {10.1109/VLSI-TSA.2012.6210122},
  issn      = {1524-766X},
  keywords  = {conference, failure analysis,phase change memories,RESET current margin,endurance cycles,endurance failure modes,material segregation effect,open failure,optimization,phase change memory,phase-dependent open-failure mechanisms,programming conditions,programming current,stuck-SET failure characteristic curves,Current density,Optimization,Phase change materials,Phase change memory,Programming,Resistance},
}

% IEDM 2011 paper on the physical origins of resistance drift in phase change memory.
@inproceedings{li2011iedm,
  author    = {Jing Li and Binquan Luan and T. H. Hsu and Y. Zhu and G. Martyna and D. Newns and H. Y. Cheng and S. Raoux and H. L. Lung and C. Lam},
  title     = {Explore physical origins of resistance drift in phase change memory and its implication for drift-insensitive materials},
  booktitle = {2011 International Electron Devices Meeting (**IEDM**)},
  year      = {2011},
  month     = {Dec},
  pages     = {12.5.1--12.5.4},
  doi       = {10.1109/IEDM.2011.6131541},
  issn      = {0163-1918},
  keywords  = {conference, amorphous semiconductors,antimony alloys,atomic structure,germanium alloys,phase change materials,phase change memories,tellurium alloys,Ge,Sb,Te,amorphous germanium,atomic structure,drift-insensitive phase change material,electrical characteristics,first principle ab initio method,material-device characterization,phase change memory,resistance drift,tellurium ternary alloys,Conductivity,Phase change materials,Phase change memory,Programming,Resistance,Temperature measurement},
  note      = {(Acceptance Rate*: \underline{33\%})},
}

% IEDM 2011 paper on a low-power phase change memory with a thermally confined TaN/TiN bottom electrode.
@inproceedings{wu2011iedm,
  author    = {J. Y. Wu and M. Breitwisch and S. Kim and T. H. Hsu and R. Cheek and P. Y. Du and Jing Li and E. K. Lai and Y. Zhu and T. Y. Wang and H. Y. Cheng and A. Schrott and E. A. Joseph and R. Dasaka and S. Raoux and M. H. Lee and H. L. Lung and C. Lam},
  title     = {A low power phase change memory using thermally confined {TaN/TiN} bottom electrode},
  booktitle = {2011 International Electron Devices Meeting (**IEDM**)},
  year      = {2011},
  month     = {Dec},
  pages     = {3.2.1--3.2.4},
  doi       = {10.1109/IEDM.2011.6131479},
  issn      = {0163-1918},
  keywords  = {conference, conductors (electric),electrodes,heat losses,integrated circuit reliability,low-power electronics,phase change memories,tantalum compounds,thermal insulation,titanium compounds,TaN-TiN,current 30 muA,electrical conductivity,electrothermal simulation,low power PCM,low power phase change memory,size 1.5 nm,size 39 nm,storage capacity 256 Mbit,thermal barrier,thermal insulation,thermally confined bottom electrode,Electrodes,Heating,Phase change memory,Solids,Thermal resistance,Tin},
  note      = {(Acceptance Rate*: \underline{33\%})},
}


% ESSCIRC 2011 paper on post-silicon calibration of analog CMOS with phase-change memory cells.
@inproceedings{wen2011esscirc,
  author    = {Cheng-Yuan Wen and Jeyanandh Paramesh and Larry Pileggi and Jing Li and SangBum Kim and Jonathan Proesel and Chung Lam},
  title     = {Post-silicon calibration of analog {CMOS} using phase-change memory cells},
  booktitle = {2011 Proceedings of the ESSCIRC (**ESSCIRC**)},
  year      = {2011},
  month     = {Sept},
  pages     = {423--426},
  doi       = {10.1109/ESSCIRC.2011.6044997},
  issn      = {1930-8833},
  keywords  = {conference, CMOS analogue integrated circuits,antimony compounds,calibration,chalcogenide glasses,comparators (circuits),elemental semiconductors,germanium compounds,phase change memories,redundancy,silicon,tellurium compounds,Ge2Sb2Te5,IBM CMOS technology,PCRAM mushroom cells,Si,analog CMOS,capacitance 4.41 fF,combinatorial redundancy,digital calibration,embedded GST,nonvolatile phase-change random access memory cells,offset-minimized CMOS comparator,post-manufacturing calibration,post-silicon calibration,power 55.42 muW,size 90 nm,switchable resistances,voltage 1 V,Arrays,CMOS integrated circuits,Calibration,Generators,Phase change random access memory,Redundancy,Resistance},
  note      = {(Acceptance Rate: \underline{38\%}, 121 out of 314)},
}

% 2011 Symposium on VLSI Circuits paper on a non-volatile look-up table built from PCM cells.
% No DOI is recorded for this entry; the empty doi placeholder was dropped.
@inproceedings{wen2011vlsi,
  author    = {C. Y. Wen and Jing Li and S. Kim and M. Breitwisch and C. Lam and J. Paramesh and L. T. Pileggi},
  title     = {A non-volatile look-up table design using {PCM} (phase-change memory) cells},
  booktitle = {2011 Symposium on VLSI Circuits - Digest of Technical Papers},
  year      = {2011},
  month     = {June},
  date      = {2011-06-15},
  pages     = {302--303},
  issn      = {2158-5636},
  keywords  = {conference, CMOS memory circuits,antimony compounds,chalcogenide glasses,germanium compounds,logic circuits,phase change memories,programmable circuits,random-access storage,tellurium compounds,CMOS technology,Ge2Sb2Te5,PCM mushroom cell,digital look-up table circuit,nonvolatile logic functions,nonvolatile look-up table design,phase-change memory,programmable logic functions,resistance transformation ratio,size 90 nm,voltage 1 V,CMOS integrated circuits,Logic gates,Phase change materials,Phase change random access memory,Resistance,Table lookup},
  note      = {(Acceptance Rate: \underline{28\%}, 115 out of 409)},
}

% IMW 2011 paper on a reconfigurable sensing scheme for variable-level storage in phase change memory.
@inproceedings{li2011imw,
  author    = {Jing Li and C. I. Wu and S. C. Lewis and J. Morrish and T. Y. Wang and R. Jordan and T. Maffitt and M. Breitwisch and A. Schrott and R. Cheek and H. L. Lung and C. Lam},
  title     = {A Novel Reconfigurable Sensing Scheme for Variable Level Storage in Phase Change Memory},
  booktitle = {2011 3rd IEEE International Memory Workshop (**IMW**)},
  year      = {2011},
  month     = {May},
  pages     = {1--4},
  doi       = {10.1109/IMW.2011.5873227},
  issn      = {2159-483X},
  keywords  = {conference, CMOS digital integrated circuits,NAND circuits,flash memories,phase change memories,2Mcell PCM chip,CMOS technology,NAND flash,analog resistance levels,frequency 50 MHz,phase change memory,reconfigurable sensing scheme,size 90 nm,time 35 mus to 50 mus,time 5 mus,variable level storage,word length 8 bit,Clocks,Electrical resistance measurement,Flash memory,Phase change materials,Radiation detectors,Resistance},
}

% IMW 2011 paper demonstrating CAM and TCAM built from phase change devices.
@inproceedings{rajendran2011imw,
  author    = {B. Rajendran and R. W. Cheek and L. A. Lastras and M. M. Franceschini and M. J. Breitwisch and A. G. Schrott and Jing Li and R. K. Montoye and L. Chang and C. Lam},
  title     = {Demonstration of {CAM} and {TCAM} Using Phase Change Devices},
  booktitle = {2011 3rd IEEE International Memory Workshop (**IMW**)},
  year      = {2011},
  month     = {May},
  pages     = {1--4},
  doi       = {10.1109/IMW.2011.5873229},
  issn      = {2159-483X},
  keywords  = {conference, Monte Carlo methods,content-addressable storage,phase change memories,Monte-Carlo simulation,PCM devices,SRAM,TCAM,content addressable memory,phase change devices,phase change memory technology,ternary CAM,Arrays,Computer aided manufacturing,FETs,Phase change materials,Programming,Resistance,Resistors},
}

% IEDM 2009 paper modelling metal grain work function variability in dual metal gate MOSFETs.
% NOTE(review): the author list contains a raw UTF-8 character; fine for Biber, confirm if classic 8-bit BibTeX is used.
@inproceedings{zhang2009iedm,
  author    = {Xiao Zhang and Jing Li and M. Grubbs and M. Deal and B. Magyari-Köpe and B. M. Clemens and Y. Nishi},
  title     = {Physical model of the impact of metal grain work function variability on emerging dual metal gate {MOSFETs} and its implication for {SRAM} reliability},
  booktitle = {2009 IEEE International Electron Devices Meeting (**IEDM**)},
  year      = {2009},
  month     = {Dec},
  date      = {2009-12},
  pages     = {1--4},
  doi       = {10.1109/IEDM.2009.5424420},
  issn      = {0163-1918},
  keywords  = {conference, MOS integrated circuits,MOSFET,SRAM chips,integrated circuit metallisation,integrated circuit reliability,work function,SRAM reliability,dual metal gate MOSFET,grain orientation difference,metal grain work function variability,polycrystalline metal gate,size 22 nm,Charge carrier density,Circuit analysis,Electrodes,Fluctuations,High K dielectric materials,MOSFETs,Predictive models,Random access memory,Resource description framework,Semiconductor process modeling},
  note      = {(Acceptance Rate*: \underline{33\%})},
}

% DAC 2009 PhD Forum entry on error-resilient STT MRAM design.
@inproceedings{li2009dac,
  author    = {Li, Jing and Roy, Kaushik},
  title     = {Robust Heterogeneous System Design in Spintronics: Error Resilient Spin Torque {MRAM} ({STT MRAM}) Design},
  booktitle = {the 46th Annual Design Automation Conference PHD Forum},
  series    = {**DAC** '09},
  year      = {2009},
  note      = {(Acceptance Rate: \underline{22\%}, 148 out of 684)},
  keywords  = {conference},
}


% ASP-DAC 2009 paper on a circuit/architecture co-design paradigm for robust STT MRAM.
@inproceedings{li2009aspdac,
  author    = {Li, Jing and Ndai, Patrick and Goel, Ashish and Liu, Haixin and Roy, Kaushik},
  title     = {An Alternate Design Paradigm for Robust Spin-torque Transfer Magnetic {RAM} ({STT MRAM}) from Circuit/Architecture Perspective},
  booktitle = {Proceedings of the 2009 Asia and South Pacific Design Automation Conference},
  series    = {**ASP-DAC** '09},
  year      = {2009},
  month     = {Jan},
  date      = {2009-01-19},
  location  = {Yokohama, Japan},
  pages     = {841--846},
  numpages  = {6},
  publisher = {IEEE Press},
  address   = {Piscataway, NJ, USA},
  isbn      = {978-1-4244-2748-2},
  doi       = {10.1109/ASPDAC.2009.4796585},
  url       = {http://dl.acm.org/citation.cfm?id=1509633.1509820},
  acmid     = {1509820},
  keywords  = {conference, stt mram},
  abstract  = {Spin-Torque Transfer Magnetic RAM (STT MRAM) is a promising candidate for future embedded applications. It provides desirable memory attributes such as fast access time, low cost, high density and non-volatility. However, variations in process parameters can lead to a large number of cells to fail, severely affecting the yield of the memory array. In this paper, we provide a thorough analysis of the impact of design parameters on parametric failures due to process variations. To achieve high memory yield without incurring expensive technology modification, we developed an alternate design paradigm ---circuit/architecture co-design --- to take advantage of different levels of design hierarchy (circuit and architecture) to improve the yield and memory density. The technique decouples the conflicting design requirements for read stability/writability and density. Consequently, the memory cell failure probability reduces by 48\% and cell area reduces by 21\% with negligible performance degradation (~0.4\%).},
  note      = {(Acceptance Rate: \underline{33\%}, 116 out of 355)},
}

% GSRC Workshop 2009 poster on variation-resilient STT MRAM.
% The third author is written given-name first (Ashish Goel) so BibTeX parses Goel as the surname.
@inproceedings{li2009gsrc,
  author    = {Jing Li and Patrick Ndai and Ashish Goel and Kaushik Roy},
  title     = {Variation Resilient Spin Torque Transfer {MRAM} (poster)},
  booktitle = {GSRC Workshop},
  year      = {2009},
  month     = {Mar},
  location  = {Dallas, TX, USA},
  keywords  = {conference},
}


% SRC TECHCON 2008 entry on failure-probability modelling and statistical design of STT MRAM arrays.
@inproceedings{li2008techcon,
  author    = {Jing Li and Kaushik Roy},
  title     = {Modeling of Failure Probability and Statistical Design of Spin-Torque Transfer Magnetic {RAM} ({STT MRAM}) Array for Yield Enhancement},
  booktitle = {SRC Technology and Talent for the 21st Century Technology (TECHCON)},
  year      = {2008},
  keywords  = {conference},
}


% CICC 2008 paper on a variation-tolerant STT MRAM array for yield enhancement.
@inproceedings{li2008cicc,
  author    = {Jing Li and Haixin Liu and S. Salahuddin and Kaushik Roy},
  title     = {Variation-tolerant Spin-Torque Transfer ({STT}) {MRAM} array for yield enhancement},
  booktitle = {2008 IEEE Custom Integrated Circuits Conference (**CICC**)},
  year      = {2008},
  month     = {Sept},
  date      = {2008-09-21},
  pages     = {193--196},
  doi       = {10.1109/CICC.2008.4672056},
  issn      = {0886-5930},
  keywords  = {conference, Green's function methods,MRAM devices,DRAM,SRAM,flash memories,nonequilibrium Green's function,optimization,variation-tolerant spin-torque transfer MRAM array,yield enhancement,Circuit simulation,Circuit stability,Circuit synthesis,Electrodes,Green's function methods,Magnetic tunneling,Random access memory,Read-write memory,Robust stability,Scalability},
}


% DAC 2008 paper on failure-probability modelling and statistical design of STT MRAM arrays.
@inproceedings{li2008dac,
  author    = {Jing Li and Charles Augustine and Sayeef Salahuddin and Kaushik Roy},
  title     = {Modeling of failure probability and statistical design of Spin-Torque Transfer Magnetic Random Access Memory ({STT MRAM}) array for yield enhancement},
  booktitle = {2008 45th ACM/IEEE Design Automation Conference (**DAC**)},
  year      = {2008},
  month     = {June},
  date      = {2008-06-08},
  pages     = {278--283},
  doi       = {10.1145/1391469.1391540},
  issn      = {0738-100X},
  keywords  = {conference, failure analysis,magnetic storage,magnetoelectronics,optimisation,random-access storage,coupled electromagnetic dynamics,failure probability,on-chip embedded memories,spin-torque transfer magnetic random access memory,spintronic device,statistical optimization methodology,yield enhancement,Couplings,Failure analysis,Flash memory,Magnetic analysis,Magnetic devices,Predictive models,Probability,Random access memory,Read-write memory,Scalability,STT MRAM,Yield},
  note      = {(Acceptance Rate: \underline{23\%}, 147 out of 639)},
}


% ITC 2007 paper on a reconfigurable test paradigm using low-cost integrated Poly-Si TFTs.
@inproceedings{li2007itc,
  author    = {Jing Li and S. Ghosh and Kaushik Roy},
  title     = {A generic and reconfigurable test paradigm using Low-cost integrated {Poly-Si TFTs}},
  booktitle = {2007 IEEE International Test Conference (**ITC**)},
  year      = {2007},
  month     = {Oct},
  pages     = {1--10},
  doi       = {10.1109/TEST.2007.4437622},
  issn      = {1089-3539},
  keywords  = {conference, VLSI,built-in self test,design for testability,elemental semiconductors,integrated circuit testing,silicon,thin film transistors,3-D technology,BIST components,Si,VLSI systems,configurable design-for-test units,generic test structure,low-cost low-temperature integrated poly-silicon TFT,process tolerant test structure,reconfigurable test structure,thin film transistors,Circuit testing,Costs,Crystallization,Design for testability,Silicon,Substrates,System testing,Temperature,Thin film transistors,Very large scale integration},
}


% ISLPED 2007 paper on a variable-latency adder design to overcome NBTI.
% NOTE(review): the key starts with li although the first listed author is Chen; key kept so existing citations keep resolving.
@inproceedings{li2007islped,
  author    = {Yiran Chen and Hai Li and Jing Li and Cheng-Kok Koh},
  title     = {Variable-latency adder ({VL-adder}): new arithmetic circuit design practice to overcome {NBTI}},
  booktitle = {2007 ACM/IEEE International Symposium on Low Power Electronics and Design (**ISLPED**)},
  year      = {2007},
  month     = {Aug},
  pages     = {195--200},
  doi       = {10.1145/1283780.1283822},
  keywords  = {conference, MOSFET,adders,logic design,low-power electronics,NBTI-induced delay degradation,NBTI-tolerant techniques,VL-adder,arithmetic circuit design,clock edge,energy efficiency,lower-power adder designs,manufacturing costs,nanoscale PMOS transistors,negative bias temperature instability,variable-latency adder technique,Adders,Arithmetic,Circuit synthesis,Clocks,Degradation,Delay,MOSFETs,Negative bias temperature instability,Niobium compounds,Titanium compounds,negative bias temperature instability (NBTI),variable-latency adder (VL-adder)},
  note      = {(Acceptance Rate: \underline{39\%}, 74 out of 192)},
}

% SRC TECHCON 2007 entry on low-power, variation-tolerant digital design with low-cost LTPS TFTs.
@inproceedings{li2007techcon,
  author    = {Jing Li and Kaushik Roy},
  title     = {Low Power and Variation Tolerant Digital Circuit Design in Sub-micron Regime using Low Cost {LTPS TFTs}},
  booktitle = {SRC Technology and Talent for the 21st Century Technology (TECHCON)},
  year      = {2007},
  keywords  = {conference},
}


% ICICDT 2007 paper on variation-aware circuit design of scaled LTPS TFTs.
@inproceedings{li2007icicdt,
  author    = {Jing Li and Kunhyuk Kang and Kaushik Roy},
  title     = {Novel Variation-Aware Circuit Design of Scaled {LTPS TFT} for Ultra low Power, Low-Cost Applications},
  booktitle = {2007 IEEE International Conference on Integrated Circuit Design and Technology (**ICICDT**)},
  year      = {2007},
  month     = {May},
  pages     = {1--4},
  doi       = {10.1109/ICICDT.2007.4299589},
  issn      = {2381-3555},
  keywords  = {conference, digital integrated circuits,elemental semiconductors,flexible electronics,grain boundaries,integrated circuit design,low-power electronics,response surface methodology,silicon,thin film transistors,Si,battery-operated portable electronics,defect grain boundary region,device-to-device variation,flexible substrate,low-cost digital design,low-temperature polycrystalline silicon thin film transistors,multifinger parallel structure,power dissipation,response surface method,scaled LTPS TFT,size 200 nm,statistical variation,variation-aware circuit design,voltage 10 V to 20 V,Circuit synthesis,Digital circuits,Flexible printed circuits,Glass,Grain boundaries,Polymers,Silicon,Substrates,Temperature,Thin film transistors,Low-temperature polycrystalline-Silicon (LTPS),Response Surface Method (RSM),grain boundary (GB),thin film transistor (TFT)},
}

% DAC 2007 paper on high-performance, low-power electronics on flexible substrate.
@inproceedings{li2007dac,
  author    = {Jing Li and Kunhyuk Kang and Aditya Bansal and Kaushik Roy},
  title     = {High Performance and Low Power Electronics on Flexible Substrate},
  booktitle = {2007 44th ACM/IEEE Design Automation Conference (**DAC**)},
  year      = {2007},
  month     = {June},
  date      = {2007-06},
  pages     = {274--275},
  doi       = {10.1145/1278480.1278550},
  issn      = {0738-100X},
  keywords  = {conference, flexible electronics,low-power electronics,semiconductor device models,silicon,substrates,thin film transistors,GB-tolerant design,flexible substrate,grain boundaries,low power electronics,polycrystalline silicon thin film transistor,ultra low power digital application,Design methodology,Design optimization,Displays,Electron traps,Grain boundaries,Low power electronics,Silicon,Substrates,Temperature,Thin film transistors,Design,Experimentation,Grain Boundary (GB),Thin Film Transistor (TFT)},
  note      = {(Acceptance Rate*: \underline{13\%})},
}

% DRC 2006 paper on low-temperature Poly-Si for low-cost, low-power sub-micron digital operation.
@inproceedings{li2006drc,
  author    = {Li, Jing and Bansal, Aditya and Roy, Kaushik},
  title     = {Exploring Low Temperature {Poly-Si} for Low Cost and Low Power Sub-micron Digital Operation},
  booktitle = {2006 64th Device Research Conference (**DRC**)},
  year      = {2006},
  month     = {June},
  pages     = {61--62},
  doi       = {10.1109/DRC.2006.305118},
  issn      = {1548-3770},
  keywords  = {conference, Costs,Crystallization,Dielectric substrates,Digital circuits,Fabrication,Grain boundaries,Grain size,Silicon,Temperature,Thin film transistors},
}

%%%%%%%%%% Thesis %%%%%%%%%%
% PhD thesis entry (Purdue University, 2009); the title now closes its parenthetical remark.
% NOTE(review): school holds the department while institution holds the university. Standard
% BibTeX thesis styles print only school, and biblatex treats school as an alias of institution,
% so one of the two may be dropped on output. Confirm which field the rendering pipeline reads
% before merging them.
@phdthesis{li2009phd,
  author      = {Li, Jing},
  title       = {Robust and Energy-efficient Heterogeneous System Design in Emerging Technologies (Best Thesis Award nominee)},
  school      = {Electrical and Computer Engineering},
  institution = {Purdue University},
  year        = {2009},
  note        = {Advisor: Prof. Kaushik Roy},
  keywords    = {phd},
}

%%%%%%%%%% Technical report %%%%%%%%%%
% Carnegie Mellon technical report 2012-002 on row buffer locality in non-volatile main memories.
@techreport{meza:2012:report:nvm,
  author      = {Justin Meza and Jing Li and Onur Mutlu},
  title       = {Evaluating Row Buffer Locality in Future Non-Volatile Main Memories},
  institution = {Carnegie Mellon University (CMU)},
  number      = {2012-002},
  year        = {2012},
  month       = {Dec},
  note        = {SAFARI Technical Report},
  keywords    = {techreport},
}

% IBM technical report (2008) on body history effects in 12S eDRAM sensing.
@techreport{li:2008:report:edram,
  author      = {Jing Li},
  title       = {Body History Study on {12S} {eDRAM} Sensing Operation},
  institution = {Semiconductor Research and Development Center (SRDC), IBM},
  address     = {Fishkill},
  year        = {2008},
  keywords    = {techreport},
}