Sophie

Sophie

distrib > Mageia > 7 > aarch64 > by-pkgid > 7e647d9940d31b34c253e6f71c416c4b > files > 2708

bzr-2.7.0-6.mga7.aarch64.rpm


<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">

<html xmlns="http://www.w3.org/1999/xhtml">
  <head>
    <meta http-equiv="X-UA-Compatible" content="IE=Edge" />
    <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
    <title>1   Bazaar Performance Roadmap &#8212; Bazaar 2.7.0 documentation</title>
    <link rel="stylesheet" href="_static/classic.css" type="text/css" />
    <link rel="stylesheet" href="_static/pygments.css" type="text/css" />
    
    <script type="text/javascript" id="documentation_options" data-url_root="./" src="_static/documentation_options.js"></script>
    <script type="text/javascript" src="_static/jquery.js"></script>
    <script type="text/javascript" src="_static/underscore.js"></script>
    <script type="text/javascript" src="_static/doctools.js"></script>
    <script type="text/javascript" src="_static/language_data.js"></script>
    
    <link rel="shortcut icon" href="_static/bzr.ico"/>

    <link rel="search" title="Search" href="search.html" />
    <link rel="next" title="Simplifying Bazaar Configuration" href="new-config-rationale.html" />
    <link rel="prev" title="Plans" href="plans.html" />
<link rel="stylesheet" href="_static/bzr-doc.css" type="text/css" />
 
  </head><body>
    <div class="related" role="navigation" aria-label="related navigation">
      <h3>Navigation</h3>
      <ul>
        <li class="right" style="margin-right: 10px">
          <a href="new-config-rationale.html" title="Simplifying Bazaar Configuration"
             accesskey="N">next</a></li>
        <li class="right" >
          <a href="plans.html" title="Plans"
             accesskey="P">previous</a> |</li>
<li><a href="http://bazaar.canonical.com/">
    <img src="_static/bzr icon 16.png" /> Home</a>&nbsp;|&nbsp;</li>
<a href="http://doc.bazaar.canonical.com/en/">Documentation</a>&nbsp;|&nbsp;</li>

        <li class="nav-item nav-item-0"><a href="index.html">Developer Document Catalog (2.7.0)</a> &#187;</li>

          <li class="nav-item nav-item-1"><a href="plans.html" accesskey="U">Plans</a> &#187;</li> 
      </ul>
    </div>  

    <div class="document">
      <div class="documentwrapper">
        <div class="bodywrapper">
          <div class="body" role="main">
            
  <div class="section" id="bazaar-performance-roadmap">
<h1><a class="toc-backref" href="#id13">1&nbsp;&nbsp;&nbsp;Bazaar Performance Roadmap</a><a class="headerlink" href="#bazaar-performance-roadmap" title="Permalink to this headline">¶</a></h1>
<div class="contents topic" id="contents">
<p class="topic-title first">Contents</p>
<ul class="auto-toc simple">
<li><a class="reference internal" href="#bazaar-performance-roadmap" id="id13">1&nbsp;&nbsp;&nbsp;Bazaar Performance Roadmap</a><ul class="auto-toc">
<li><a class="reference internal" href="#about-the-performance-roadmap" id="id14">1.1&nbsp;&nbsp;&nbsp;About the performance roadmap</a><ul class="auto-toc">
<li><a class="reference internal" href="#what-should-be-in-the-roadmap" id="id15">1.1.1&nbsp;&nbsp;&nbsp;What should be in the roadmap?</a></li>
<li><a class="reference internal" href="#what-should-the-final-system-look-like-how-is-it-different-to-what-we-have-today" id="id16">1.1.2&nbsp;&nbsp;&nbsp;What should the final system look like, how is it different to what we have today?</a></li>
<li><a class="reference internal" href="#what-use-cases-should-be-covered" id="id17">1.1.3&nbsp;&nbsp;&nbsp;What use cases should be covered?</a></li>
<li><a class="reference internal" href="#how-is-development-on-the-roadmap-coordinated" id="id18">1.1.4&nbsp;&nbsp;&nbsp;How is development on the roadmap coordinated?</a></li>
<li><a class="reference internal" href="#planned-changes-to-the-bzr-core" id="id19">1.1.5&nbsp;&nbsp;&nbsp;Planned changes to the bzr core</a><ul class="auto-toc">
<li><a class="reference internal" href="#library-changes" id="id20">1.1.5.1&nbsp;&nbsp;&nbsp;Library changes</a></li>
<li><a class="reference internal" href="#interoperable-disk-changes" id="id21">1.1.5.2&nbsp;&nbsp;&nbsp;Interoperable disk changes</a></li>
<li><a class="reference internal" href="#possibly-non-interoperable-disk-changes" id="id22">1.1.5.3&nbsp;&nbsp;&nbsp;Possibly non-interoperable disk changes</a></li>
<li><a class="reference internal" href="#non-interoperable-disk-changes" id="id23">1.1.5.4&nbsp;&nbsp;&nbsp;Non-interoperable disk changes</a></li>
</ul>
</li>
<li><a class="reference internal" href="#integration-of-performance-changes" id="id24">1.1.6&nbsp;&nbsp;&nbsp;Integration of performance changes</a></li>
</ul>
</li>
<li><a class="reference internal" href="#analysis-of-use-cases" id="id25">1.2&nbsp;&nbsp;&nbsp;Analysis of use cases</a><ul class="auto-toc">
<li><a class="reference internal" href="#analysing-a-specific-use-case" id="id26">1.2.1&nbsp;&nbsp;&nbsp;Analysing a specific use case</a></li>
<li><a class="reference internal" href="#performing-the-analysis" id="id27">1.2.2&nbsp;&nbsp;&nbsp;Performing the analysis</a></li>
<li><a class="reference internal" href="#what-factors-should-be-considered" id="id28">1.2.3&nbsp;&nbsp;&nbsp;What factors should be considered?</a></li>
</ul>
</li>
<li><a class="reference internal" href="#use-cases" id="id29">1.3&nbsp;&nbsp;&nbsp;Use cases</a><ul class="auto-toc">
<li><a class="reference internal" href="#initial-push-pull" id="id30">1.3.1&nbsp;&nbsp;&nbsp;Initial push / pull</a><ul class="auto-toc">
<li><a class="reference internal" href="#optimal-case" id="id31">1.3.1.1&nbsp;&nbsp;&nbsp;Optimal case</a></li>
<li><a class="reference internal" href="#disk-case" id="id32">1.3.1.2&nbsp;&nbsp;&nbsp;Disk case</a></li>
<li><a class="reference internal" href="#smart-network-case" id="id33">1.3.1.3&nbsp;&nbsp;&nbsp;Smart Network Case</a><ul class="auto-toc">
<li><a class="reference internal" href="#phase-1" id="id34">1.3.1.3.1&nbsp;&nbsp;&nbsp;Phase 1</a></li>
<li><a class="reference internal" href="#phase-2" id="id35">1.3.1.3.2&nbsp;&nbsp;&nbsp;Phase 2</a></li>
</ul>
</li>
<li><a class="reference internal" href="#dumb-network-case" id="id36">1.3.1.4&nbsp;&nbsp;&nbsp;Dumb Network Case</a></li>
<li><a class="reference internal" href="#wants" id="id37">1.3.1.5&nbsp;&nbsp;&nbsp;Wants</a></li>
</ul>
</li>
<li><a class="reference internal" href="#incremental-push-pull" id="id38">1.3.2&nbsp;&nbsp;&nbsp;Incremental push/pull</a><ul class="auto-toc">
<li><a class="reference internal" href="#functional-requirements" id="id39">1.3.2.1&nbsp;&nbsp;&nbsp;Functional Requirements</a></li>
<li><a class="reference internal" href="#factors-which-should-add-work-for-push-pull" id="id40">1.3.2.2&nbsp;&nbsp;&nbsp;Factors which should add work for push/pull</a></li>
<li><a class="reference internal" href="#push-pull-overview" id="id41">1.3.2.3&nbsp;&nbsp;&nbsp;Push/pull overview</a><ul class="auto-toc">
<li><a class="reference internal" href="#new-data-identification" id="id42">1.3.2.3.1&nbsp;&nbsp;&nbsp;New data identification</a><ul class="auto-toc">
<li><a class="reference internal" href="#set-synchronisation-approaches" id="id43">1.3.2.3.1.1&nbsp;&nbsp;&nbsp;Set synchronisation approaches</a></li>
<li><a class="reference internal" href="#dag-synchronisation-approaches" id="id44">1.3.2.3.1.2&nbsp;&nbsp;&nbsp;DAG synchronisation approaches</a></li>
<li><a class="reference internal" href="#file-level-scaling" id="id45">1.3.2.3.1.3&nbsp;&nbsp;&nbsp;File level scaling</a></li>
<li><a class="reference internal" href="#api-scaling" id="id46">1.3.2.3.1.4&nbsp;&nbsp;&nbsp;API scaling</a></li>
</ul>
</li>
<li><a class="reference internal" href="#data-reading" id="id47">1.3.2.3.2&nbsp;&nbsp;&nbsp;Data reading</a><ul class="auto-toc">
<li><a class="reference internal" href="#id1" id="id48">1.3.2.3.2.1&nbsp;&nbsp;&nbsp;File level scaling</a></li>
<li><a class="reference internal" href="#id2" id="id49">1.3.2.3.2.2&nbsp;&nbsp;&nbsp;API scaling</a></li>
</ul>
</li>
<li><a class="reference internal" href="#data-verification-and-writing" id="id50">1.3.2.3.3&nbsp;&nbsp;&nbsp;Data Verification and writing</a><ul class="auto-toc">
<li><a class="reference internal" href="#overview-summary" id="id51">1.3.2.3.3.1&nbsp;&nbsp;&nbsp;Overview summary</a></li>
<li><a class="reference internal" href="#id3" id="id52">1.3.2.3.3.2&nbsp;&nbsp;&nbsp;File level scaling</a></li>
<li><a class="reference internal" href="#id4" id="id53">1.3.2.3.3.3&nbsp;&nbsp;&nbsp;API scaling</a></li>
</ul>
</li>
</ul>
</li>
<li><a class="reference internal" href="#notes-from-london" id="id54">1.3.2.4&nbsp;&nbsp;&nbsp;Notes from London</a></li>
</ul>
</li>
<li><a class="reference internal" href="#add" id="id55">1.3.3&nbsp;&nbsp;&nbsp;Add</a><ul class="auto-toc">
<li><a class="reference internal" href="#least-work-we-can-hope-to-perform" id="id56">1.3.3.1&nbsp;&nbsp;&nbsp;Least work we can hope to perform</a></li>
<li><a class="reference internal" href="#per-file-algorithm" id="id57">1.3.3.2&nbsp;&nbsp;&nbsp;Per file algorithm</a></li>
</ul>
</li>
<li><a class="reference internal" href="#commit-performance-notes" id="id58">1.3.4&nbsp;&nbsp;&nbsp;Commit Performance Notes</a><ul class="auto-toc">
<li><a class="reference internal" href="#changes-to-commit" id="id59">1.3.4.1&nbsp;&nbsp;&nbsp;Changes to commit</a></li>
<li><a class="reference internal" href="#commit-the-minimum-work-required" id="id60">1.3.4.2&nbsp;&nbsp;&nbsp;Commit: The Minimum Work Required</a></li>
<li><a class="reference internal" href="#commit-vs-status" id="id61">1.3.4.3&nbsp;&nbsp;&nbsp;Commit vs Status</a></li>
<li><a class="reference internal" href="#avoiding-work-smarter-change-detection" id="id62">1.3.4.4&nbsp;&nbsp;&nbsp;Avoiding Work: Smarter Change Detection</a></li>
<li><a class="reference internal" href="#avoiding-work-better-layering" id="id63">1.3.4.5&nbsp;&nbsp;&nbsp;Avoiding Work: Better Layering</a></li>
<li><a class="reference internal" href="#avoiding-work-avoiding-reading-parent-data" id="id64">1.3.4.6&nbsp;&nbsp;&nbsp;Avoiding work: avoiding reading parent data</a></li>
<li><a class="reference internal" href="#code-structure" id="id65">1.3.4.7&nbsp;&nbsp;&nbsp;Code structure</a></li>
<li><a class="reference internal" href="#complications-of-commit" id="id66">1.3.4.8&nbsp;&nbsp;&nbsp;Complications of commit</a></li>
<li><a class="reference internal" href="#interface-stack" id="id67">1.3.4.9&nbsp;&nbsp;&nbsp;Interface stack</a></li>
<li><a class="reference internal" href="#branch-tree-interface" id="id68">1.3.4.10&nbsp;&nbsp;&nbsp;Branch-&gt;Tree interface</a></li>
<li><a class="reference internal" href="#information-from-the-tree-to-repository" id="id69">1.3.4.11&nbsp;&nbsp;&nbsp;Information from the tree to repository</a></li>
<li><a class="reference internal" href="#information-from-the-repository-to-the-tree" id="id70">1.3.4.12&nbsp;&nbsp;&nbsp;Information from the repository to the tree</a></li>
<li><a class="reference internal" href="#selective-commit" id="id71">1.3.4.13&nbsp;&nbsp;&nbsp;Selective commit</a></li>
<li><a class="reference internal" href="#common-commit-code" id="id72">1.3.4.14&nbsp;&nbsp;&nbsp;Common commit code</a></li>
<li><a class="reference internal" href="#order-of-traversal" id="id73">1.3.4.15&nbsp;&nbsp;&nbsp;Order of traversal</a></li>
<li><a class="reference internal" href="#open-question-per-file-graphs" id="id74">1.3.4.16&nbsp;&nbsp;&nbsp;Open question: per-file graphs</a></li>
</ul>
</li>
<li><a class="reference internal" href="#diff-performance-analysis" id="id75">1.3.5&nbsp;&nbsp;&nbsp;diff Performance Analysis</a><ul class="auto-toc">
<li><a class="reference internal" href="#minimal-work" id="id76">1.3.5.1&nbsp;&nbsp;&nbsp;Minimal Work</a><ul class="auto-toc">
<li><a class="reference internal" href="#reuse-of-historical-comparisons" id="id77">1.3.5.1.1&nbsp;&nbsp;&nbsp;Reuse of historical comparisons</a></li>
<li><a class="reference internal" href="#historical-tree-against-historical-tree" id="id78">1.3.5.1.2&nbsp;&nbsp;&nbsp;Historical Tree Against Historical Tree</a></li>
<li><a class="reference internal" href="#basis-against-historical-tree" id="id79">1.3.5.1.3&nbsp;&nbsp;&nbsp;Basis Against Historical Tree</a></li>
<li><a class="reference internal" href="#basis-against-basis" id="id80">1.3.5.1.4&nbsp;&nbsp;&nbsp;Basis Against Basis</a></li>
<li><a class="reference internal" href="#working-tree-against-basis" id="id81">1.3.5.1.5&nbsp;&nbsp;&nbsp;Working Tree Against Basis</a></li>
<li><a class="reference internal" href="#working-tree-against-historical-tree" id="id82">1.3.5.1.6&nbsp;&nbsp;&nbsp;Working Tree Against Historical Tree</a></li>
<li><a class="reference internal" href="#working-tree-against-working-tree" id="id83">1.3.5.1.7&nbsp;&nbsp;&nbsp;Working Tree Against Working Tree</a></li>
</ul>
</li>
<li><a class="reference internal" href="#api-changes" id="id84">1.3.5.2&nbsp;&nbsp;&nbsp;API Changes</a></li>
<li><a class="reference internal" href="#storage-considerations" id="id85">1.3.5.3&nbsp;&nbsp;&nbsp;Storage considerations</a></li>
</ul>
</li>
<li><a class="reference internal" href="#garbage-collection" id="id86">1.3.6&nbsp;&nbsp;&nbsp;Garbage Collection</a><ul class="auto-toc">
<li><a class="reference internal" href="#id7" id="id87">1.3.6.1&nbsp;&nbsp;&nbsp;Least work we can hope to perform</a></li>
</ul>
</li>
<li><a class="reference internal" href="#revert" id="id88">1.3.7&nbsp;&nbsp;&nbsp;Revert</a><ul class="auto-toc">
<li><a class="reference internal" href="#id8" id="id89">1.3.7.1&nbsp;&nbsp;&nbsp;Least work we can hope to perform</a></li>
</ul>
</li>
<li><a class="reference internal" href="#the-status-command" id="id90">1.3.8&nbsp;&nbsp;&nbsp;The status command</a><ul class="auto-toc">
<li><a class="reference internal" href="#ui-overview" id="id91">1.3.8.1&nbsp;&nbsp;&nbsp;UI Overview</a></li>
<li><a class="reference internal" href="#ideal-work-for-working-tree-to-historical-status" id="id92">1.3.8.2&nbsp;&nbsp;&nbsp;Ideal work for working tree to historical status</a></li>
<li><a class="reference internal" href="#locality-of-reference" id="id93">1.3.8.3&nbsp;&nbsp;&nbsp;Locality of reference</a></li>
<li><a class="reference internal" href="#scaling-observations" id="id94">1.3.8.4&nbsp;&nbsp;&nbsp;Scaling observations</a></li>
</ul>
</li>
<li><a class="reference internal" href="#annotate" id="id95">1.3.9&nbsp;&nbsp;&nbsp;Annotate</a></li>
<li><a class="reference internal" href="#scaling-analysys-of-merge" id="id96">1.3.10&nbsp;&nbsp;&nbsp;Scaling analysys of Merge</a><ul class="auto-toc">
<li><a class="reference internal" href="#needs" id="id97">1.3.10.1&nbsp;&nbsp;&nbsp;Needs</a></li>
<li><a class="reference internal" href="#notes" id="id98">1.3.10.2&nbsp;&nbsp;&nbsp;Notes</a></li>
</ul>
</li>
<li><a class="reference internal" href="#bundle-creation" id="id99">1.3.11&nbsp;&nbsp;&nbsp;Bundle Creation</a><ul class="auto-toc">
<li><a class="reference internal" href="#id10" id="id100">1.3.11.1&nbsp;&nbsp;&nbsp;Needs</a></li>
</ul>
</li>
<li><a class="reference internal" href="#uncommit-performance-notes" id="id101">1.3.12&nbsp;&nbsp;&nbsp;Uncommit Performance Notes</a><ul class="auto-toc">
<li><a class="reference internal" href="#specification-of-uncommit" id="id102">1.3.12.1&nbsp;&nbsp;&nbsp;Specification of uncommit</a></li>
</ul>
</li>
<li><a class="reference internal" href="#missing" id="id103">1.3.13&nbsp;&nbsp;&nbsp;Missing</a></li>
</ul>
</li>
<li><a class="reference internal" href="#subsystem-designs" id="id104">1.4&nbsp;&nbsp;&nbsp;Subsystem designs</a><ul class="auto-toc">
<li><a class="reference internal" href="#directory-fingerprints" id="id105">1.4.1&nbsp;&nbsp;&nbsp;Directory fingerprints</a><ul class="auto-toc">
<li><a class="reference internal" href="#introduction" id="id106">1.4.1.1&nbsp;&nbsp;&nbsp;Introduction</a></li>
<li><a class="reference internal" href="#use-case-oriented-apis" id="id107">1.4.1.2&nbsp;&nbsp;&nbsp;Use-case oriented APIs</a><ul class="auto-toc">
<li><a class="reference internal" href="#commit" id="id108">1.4.1.2.1&nbsp;&nbsp;&nbsp;<code class="docutils literal notranslate"><span class="pre">commit</span></code></a></li>
<li><a class="reference internal" href="#log" id="id109">1.4.1.2.2&nbsp;&nbsp;&nbsp;<code class="docutils literal notranslate"><span class="pre">log</span></code></a></li>
</ul>
</li>
<li><a class="reference internal" href="#open-questions" id="id110">1.4.1.3&nbsp;&nbsp;&nbsp;Open questions</a></li>
<li><a class="reference internal" href="#conclusions" id="id111">1.4.1.4&nbsp;&nbsp;&nbsp;Conclusions</a></li>
<li><a class="reference internal" href="#design-changes" id="id112">1.4.1.5&nbsp;&nbsp;&nbsp;Design changes</a></li>
<li><a class="reference internal" href="#id12" id="id113">1.4.1.6&nbsp;&nbsp;&nbsp;API changes</a></li>
</ul>
</li>
</ul>
</li>
</ul>
</li>
</ul>
</div>
<div class="section" id="about-the-performance-roadmap">
<h2><a class="toc-backref" href="#id14">1.1&nbsp;&nbsp;&nbsp;About the performance roadmap</a><a class="headerlink" href="#about-the-performance-roadmap" title="Permalink to this headline">¶</a></h2>
<div class="section" id="what-should-be-in-the-roadmap">
<h3><a class="toc-backref" href="#id15">1.1.1&nbsp;&nbsp;&nbsp;What should be in the roadmap?</a><a class="headerlink" href="#what-should-be-in-the-roadmap" title="Permalink to this headline">¶</a></h3>
<p>A good roadmap provides a place for contributors to look for tasks, it
provides users with a sense of when we will fix things that are
affecting them, and it also allows us all to agree about where we are
headed. So the roadmap should contain enough things to let all this
happen.</p>
<p>I think that it needs to contain the analysis work which is required, a
list of the use cases to be optimised, the disk changes required, and
the broad sense of the api changes required. It also needs to list the
inter-dependencies between these things: we should aim for a large
surface area of ‘ready to be worked on’ items, that makes it easy to
improve performance without having to work in lockstep with other
developers.</p>
<p>Clearly the analysis step is an immediate bottleneck - we cannot tell if
an optimisation for use case A is a pessimism for use case B until we
have analysed both A and B. I propose that we complete the analysis of
say a dozen core use cases end to end during the upcoming sprint in
London. We should then be able to fork() for much of the detailed design
work and regroup with disk and api changes shortly thereafter.</p>
<p>I suspect that clarity of layering will make a big difference to
developer parallelism, so another proposal I have is for us to look at
the APIs for Branch and Repository in London in the light of what we
have learnt over the last years.</p>
</div>
<div class="section" id="what-should-the-final-system-look-like-how-is-it-different-to-what-we-have-today">
<h3><a class="toc-backref" href="#id16">1.1.2&nbsp;&nbsp;&nbsp;What should the final system look like, how is it different to what we have today?</a><a class="headerlink" href="#what-should-the-final-system-look-like-how-is-it-different-to-what-we-have-today" title="Permalink to this headline">¶</a></h3>
<p>One of the things I like the most about bzr is its rich library API, and
I’ve heard this from numerous other folk. So anything that will remove
that should be considered a last resort.</p>
<p>Similarly our relatively excellent cross platform support is critical
for projects that are themselves cross platform, and thats a
considerable number these days.</p>
<p>And of course, our focus on doing the right thing is what differentiates
us from some of the other VCS’s, so we should be focusing on doing the
right thing quickly :).</p>
<p>What we have today though has grown organically in response to us
identifying bottlenecks over several iterations of back end storage,
branch metadata and the local tree representation. I think we are
largely past that and able to describe the ideal characteristics of the
major actors in the system - primarily Tree, Branch, Repository - based
on what we have learnt.</p>
</div>
<div class="section" id="what-use-cases-should-be-covered">
<h3><a class="toc-backref" href="#id17">1.1.3&nbsp;&nbsp;&nbsp;What use cases should be covered?</a><a class="headerlink" href="#what-use-cases-should-be-covered" title="Permalink to this headline">¶</a></h3>
<p>My list of use cases is probably not complete - its just the ones I
happen to see a lot :). I think each should be analysed comprehensively
so we dont need to say ‘push over the network’ - its implied in the
scaling analysis that both semantic and file operation latency will be
considered.</p>
<p>These use cases are ordered by roughly the ease of benchmarking, and the
frequency of use. This ordering is so that when people are comparing bzr
they are going to get use cases we have optimised; and so that as we
speed things up our existing users will have the things they do the most
optimised.</p>
<blockquote>
<div><ul class="simple">
<li>status tree</li>
<li>status subtree</li>
<li>commit</li>
<li>commit to a bound branch</li>
<li>incremental push/pull</li>
<li>log</li>
<li>log path</li>
<li>add</li>
<li>initial push or pull [both to a new repo and an existing repo with
different data in it]</li>
<li>diff tree</li>
<li>diff subtree</li>
<li>revert tree</li>
<li>revert subtree</li>
<li>merge from a branch</li>
<li>merge from a bundle</li>
<li>annotate</li>
<li>create a bundle against a branch</li>
<li>uncommit</li>
<li>missing</li>
<li>update</li>
<li>cbranch</li>
</ul>
</div></blockquote>
</div>
<div class="section" id="how-is-development-on-the-roadmap-coordinated">
<h3><a class="toc-backref" href="#id18">1.1.4&nbsp;&nbsp;&nbsp;How is development on the roadmap coordinated?</a><a class="headerlink" href="#how-is-development-on-the-roadmap-coordinated" title="Permalink to this headline">¶</a></h3>
<p>I think we should hold regular get-togethers (on IRC) to coordinate on
our progress, because this is a big task and its a lot easier to start
helping out some area which is having trouble if we have kept in contact
about each areas progress. This might be weekly or fortnightly or some
such.</p>
<p>we need a shared space to record the results of the analysis and the
roadmap as we go forward. Given that we’ll need to update these as new
features are considered, I propose that we use doc/design as a working
space, and as we analyse use cases we include them in there - including
the normal review process for each patch. We also need documentation
about doing performance tuning - not the minutiae, though that is
needed, but about how to effective choose things to optimise which will
give the best return on time spent - that is what the roadmap should
help with, but this looks to be a large project and an overview will be
of great assistance I think. We want to help everyone that wishes to
contribute to performance to do so effectively.</p>
<p>Finally, it’s important to note that coding is not the only contribution
- testing, giving feedback on current performance, helping with the
analysis are all extremely important tasks too and we probably want to
have clear markers of where that should be done to encourage such
contributions.</p>
</div>
<div class="section" id="planned-changes-to-the-bzr-core">
<h3><a class="toc-backref" href="#id19">1.1.5&nbsp;&nbsp;&nbsp;Planned changes to the bzr core</a><a class="headerlink" href="#planned-changes-to-the-bzr-core" title="Permalink to this headline">¶</a></h3>
<p>Delivering the best possible performance requires changing the bzr core design
from that present in 0.16. Some of these changes are incremental and can be
done with no impact on disk format. Many of them however do require changes to
the disk format, and these can be broken into two sets of changes, those which
are sufficiently close to the model bzr uses today to interoperate with the
0.16 disk formats, and those that are not able to interoperate with the 0.16
disk formats - specifically some planned changes may result in data which
cannot be exported to bzr 0.16’s disk formats and then imported back to the new
format without losing critical information. If/when this takes place it will be
essentially a migration for users to switch from their bzr 0.16 repository to a
bzr that supports them. We plan to batch all such changes into one large
‘experimental’ repository format, which will be complete stable and usable
before we migrate it to become a supported format. Getting new versions of bzr
in widespread use at that time will be very important, otherwise the user base
may be split in two - users that have upgraded and users that have not.</p>
<p>The following changes are grouped according to their compatability impact:
library only, disk format but interoperable, disk format interoperability
unknown, and disk format, not interoperable.</p>
<div class="section" id="library-changes">
<h4><a class="toc-backref" href="#id20">1.1.5.1&nbsp;&nbsp;&nbsp;Library changes</a><a class="headerlink" href="#library-changes" title="Permalink to this headline">¶</a></h4>
<p>These changes will change bzrlib’s API but will not affect the disk format and
thus do not pose a significant migration issue.</p>
<blockquote>
<div><ul class="simple">
<li>For our 20 core use cases, we plan to add targeted API’s to bzrlib that are
repository-representation agnostic. These will instead reflect the shape of
data access most optimal for that case.</li>
<li>Deprecate ‘versioned files’ as a library concept. Instead of asking for
information about a file-over-time as a special case, we will move to an API
that assumes less coupling between the historical information and the
ability to obtain texts/deltas etc. Specifically, we need to remove all
API’s that act in terms of on disk representation except those within a
given repository implementation.</li>
<li>Create a validator for revisions that is more amenable to use by other parts
of the code base than just the gpg signing facility. This can be done today
without changing disk, possibly with a performance hit until the disk
formats match the validatory logic. It will be hard to tell if we have the
right routine for that until all the disk changes are complete, so while
this is a library only change, it’s likely one that will be delayed to near
the end of the process.</li>
<li>Add an explicit API for managing cached annotations. While annotations are
considered a cache this is not exposed in such a way that cache operations
like ‘drop the cache’ can be performed. On current disk formats the cache is
mandatory, but an API to manage would allow refreshing of the cache (e.g.
after ghosts are filled in during baz conversions).</li>
<li>Use the _iter_changes API to perform merges. This is a small change that may
remove the need to use inventories in merge, making a dramatic difference to
merge performance once the tree shape comparison optimisations are
implemented.</li>
<li>Create a network-efficient revision graph API. This is the logic at the
start of push and pull operations, which currently scales O(graph size).
Fixing the scaling can be done, but there are tradeoffs to latency and
performance to consider, making it a little tricky to get right.</li>
<li>Working tree disk operation ordering. We plan to change the order in which
some operations are done (specifically TreeTransform ones) to improve
performance. There is already a 66% performance boost in that area going
through review.</li>
<li>Stop requiring full memory copies of files. Currently bzr requires that it
can hold 3 copies of any file it’s versioning in memory. Solving this is
tricky, particularly without performance regressions on small files, but
without solving it versioning of .iso and other large objects will continue
to be extremely painful.</li>
<li>Add an API for per-file graph access that alllows incremental access and is
suitable for on-demand generation if desired.</li>
<li>Repository stacking API. Allowing multiple databases to be stacked to give a
single ‘repository’ will allow implementation of some long desired features
like history horizons, and bundle usage where the bundle is not added to the
local repository just to examine its contents.</li>
<li>Revision data manipulation API. We need a single streaming API for adding
data to or getting it from a repository. This will need to allow hints such
as ‘optimise for size’, or ‘optimise for fast-addition’ to meet the various
users planned, but it is a core part of the library today, and it’s not
sufficiently clean to let us simplify/remove a lot of related code today.</li>
</ul>
</div></blockquote>
</div>
<div class="section" id="interoperable-disk-changes">
<h4><a class="toc-backref" href="#id21">1.1.5.2&nbsp;&nbsp;&nbsp;Interoperable disk changes</a><a class="headerlink" href="#interoperable-disk-changes" title="Permalink to this headline">¶</a></h4>
<blockquote>
<div><ul class="simple">
<li>New container format to allow single-file description of multiple named
objects. This will provide the basis for transmission of revisions over the
network, the new bundle format, and possibly a new repository format as
well. [Core implemented]</li>
<li>Separate the annotation cache from the storage of actual file texts and make
the annotation style, and when to do it, configurable. This will reduce data
sent over the wire when repositories have had ‘needs-annotations’ turned
off, which very large trees may choose to do - generating just-in-time
annotations may be desirable for those trees (even when performing
annotation based merges).</li>
<li>Repository disk operation ordering. The order that tasks access data within
the repository and the layout of the data should be harmonised. This will
require disk format changes but does not inherently alter the model, so it’s
straight forward to export from a repository that has been optimised in this
way to a 0.16 based repository.</li>
<li>Inventory representation. An inventory is a logical description of the shape
of a version controlled tree. Currently we operate on the whole inventory as
a tree broken down per directory, but we store it as a flat file. This scale
very poorly as even a minor change between inventories requires us to scan
the entire file, and in large trees this is many megabytes of data to
consider. We are investigating the exact form, but the intent is to change
the serialisation of inventories so that comparing two inventories can be
done in some smaller time - e.g. O(log N) scaling. Whatever form this takes,
a repository that can export it directly will be able to perform operations
between two historical trees much more efficiently than the current
repositories.</li>
<li>Greatest distance from origin cache. This is a possible change to introduce,
but it may be unnecessary - listed here for completeness till it has been
established as [un]needed.</li>
</ul>
</div></blockquote>
</div>
<div class="section" id="possibly-non-interoperable-disk-changes">
<h4><a class="toc-backref" href="#id22">1.1.5.3&nbsp;&nbsp;&nbsp;Possibly non-interoperable disk changes</a><a class="headerlink" href="#possibly-non-interoperable-disk-changes" title="Permalink to this headline">¶</a></h4>
<blockquote>
<div><ul>
<li><p class="first">Removing of derivable data from the core of bzr. Much of the data that bzr
stores is derivable from the users source files. For instance the
annotations that record who introduced a line. Given the full history for a
repository we can recreate that at any time. We want to remove the
dependence of the core of bzr on any data that is derivable, because doing
this will give us the freedom to:</p>
<ul class="simple">
<li>Improve the derivation algorithm over time.</li>
<li>Deal with bugs in the derivation algorithms without having ‘corrupt
repositories’ or such things.</li>
</ul>
<p>However, some of the data that is technically derived, like the per-file
merge graph, is both considered core, and can be generated differently when
certain circumstances arive, by bzr 0.16. Any change to the ‘core’ status of
that data will discard data that cannot be recreated and thus lead to the
inability to export from a format where that is derived data to bzr 0.16’s
formats without errors occuring in those circumstances. Some of the data
that may be considered for this includes:</p>
<ul class="simple">
<li>Per file merge graphs</li>
<li>Annotations</li>
</ul>
</li>
</ul>
</div></blockquote>
</div>
<div class="section" id="non-interoperable-disk-changes">
<h4><a class="toc-backref" href="#id23">1.1.5.4&nbsp;&nbsp;&nbsp;Non-interoperable disk changes</a><a class="headerlink" href="#non-interoperable-disk-changes" title="Permalink to this headline">¶</a></h4>
<blockquote>
<div><ul class="simple">
<li>Drop the per-file merge graph ‘cache’ currently held in the FILE-ID.kndx
files. A specific case of removing derivable data, this may allow smaller
inventory metadata and also make it easier to allow two different trees (in
terms of last-change made, e.g. if one is a working tree) to be compared
using a hash-tree style approach.</li>
<li>Use hash based names for some objects in the bzr database. Because it would force
total-knowledge-of-history on the graph revision objects will not be namable
via hash’s and neither will revisio signatures. Other than that though we
can in principle use hash’s e.g. SHA1 for everything else. There are many
unanswered questions about hash based naming related to locality of
reference impacts, which need to be answered before this becomes a definite
item.</li>
</ul>
</div></blockquote>
</div>
</div>
<div class="section" id="integration-of-performance-changes">
<h3><a class="toc-backref" href="#id24">1.1.6&nbsp;&nbsp;&nbsp;Integration of performance changes</a><a class="headerlink" href="#integration-of-performance-changes" title="Permalink to this headline">¶</a></h3>
<p>To deliver a version of bzr with all our planned changes will require
significant integration work. Minimally each change needs to integrate with
some aspect of the bzr version it’s merged into, but in reality many of these
changes while conceptually independent will in fact have to integrate with the
other changes we have planned before can have a completed system.</p>
<p>Additionally changes that alter disk formats are inherently more tricky to
integrate because we will often need to alter apis throughout the code base to
expose the increased or reduced model of the preferred disk format.</p>
<p>You can generate a graph <code class="docutils literal notranslate"><span class="pre">performance.png</span></code> in the source tree from
Graphviz “dot” file <code class="docutils literal notranslate"><span class="pre">performance.dot</span></code>.  This graphs out the dependencies
to let us make accurate assessments of the changes needed in terms of code
and API, hopefully minimising the number of different integration steps we
have to take, while giving us a broad surface area for development. It’s
based on a summary in the next section of this document of the planned
changes with their expected collaborators and dependencies. Where a
command is listed, the expectation is that all uses of that command -
local, remote, dumb transport and smart transport are being addressed
together.</p>
<p>The following provides a summary of the planned changes and their expected
collaborators within the code base, along with an estimate of whether they are
likely to require changes to their collaborators to be considered ‘finished’.</p>
<blockquote>
<div><ul class="simple">
<li>Use case target APIs: Each of these is likely to alter the Tree interface.
Some few of them focus on Branch and will alter Branch and Repository
accordingly. As they are targeted APIs we can deep changes all the way down
the stack to the underlying representation to make it all fit well.
Presenting a top level API for many things will be possible now as long as
the exposed data is audited for things we plan to make optional, or remove:
Such things cannot be present in the final API. Writing these APIs now will
provide strong feedback to the design process for those things which are
considered optional or removable, so these APIs should be implemented
before removing or making optional existing data.</li>
<li>Deprecating versioned files as a supported API: This collaborates with the
Repository API but can probably be done by adding a replacement API for
places where the versioned-file api is used. We may well want to keep a
concept of ‘a file over time’ or ‘inventories over time’, so the existing
repository model of exposing versioned file objects may be ok; what we need
to ensure we do is remove the places in the code base where you create or
remove or otherwise describe manipulation of the storage by knit rather than
talking at the level of file ids and revision ids. The current
versioned-file API would be a burden for implementors of a blob based
repository format, so the removal of callers, and deprecation of those parts
of the API should be done before creating a blob based repository format.</li>
<li>Creating a revision validator: Revision validators may depend on storage
layer changes to inventories so while we can create a revision validator
API, we cannot create the final one until we have the inventory structural
changes completed.</li>
<li>Annotation caching API: This API is a prerequisite for new repository
formats. If written after they are introduced we may find that the
repository is lacking in functionality, so the API should be implemented
first.</li>
<li>_iter_changes based merging: If the current _iter_changes_ API is
insufficient, we should know about that before designing the disk format for
generating fast _iter_changes_ output.</li>
<li>Network-efficient revision graph API: This influences what questions we will
want to ask a local repository very quickly; as such it’s a driver for the
new repository format and should be in place first if possible. It’s probably
not sufficiently different to local operations to make this a hard ordering
though.</li>
<li>Working tree disk ordering: Knowing the expected order for disk operations
may influence the needed use case specific APIs, so having a solid
understanding of what is optimal - and why - and whether it is pessimal on
non-Linux-kernel platforms is rather important.</li>
<li>Be able to version files greater than memory in size: This cannot be
achieved until all parts of the library which deal with user files are able
to provide access to files larger than memory. Many strategies can be
considered for this - such as temporary files on disk, memory mapping etc.
We should have enough of a design laid out that developers of repository and
tree logic are able to start exposing apis, and considering requirements
related to them, to let this happen.</li>
<li>Per-file graph access API: This should be implemented on top of or as part
of the newer API for accessing data about a file over time. It can be a
separate step easily; but as it’s in the same area of the library should not
be done in parallel.</li>
<li>Repository stacking API: The key dependency/change required for this is that
repositories must individually be happy with having partial data - e.g. many
ghosts. However the way the API needs to be used should be driven from the
command layer in, because it’s unclear at the moment what will work best.</li>
<li>Revision stream API: This API will become clear as we streamline commands.
On the data insertion side commit will want to generate new data. The
commands pull, bundle, merge, push, possibly uncommit will want to copy
existing data in a streaming fashion.</li>
<li>New container format: It’s hard to tell what the right way to structure the
layering is. Probably having smooth layering down to the point that code
wants to operate on the containers directly will make this more clear. As
bundles will become a read-only branch &amp; repository, the smart server wants
streaming-containers, and we are planning a pack based repository, it
appears that we will have three different direct container users. However,
the bundle user may in fact be fake - because it really is a repository.</li>
<li>Separation of annotation cache: Making the disk changes to achieve this
depends on the new API being created. Bundles probably want to be
annotation-free, so they are a form of implementation of this and will need
the on-demand annotation facility.</li>
<li>Repository operation disk ordering: Dramatically changing the ordering of
disk operations requires a new repository format. We have most of the
analysis done to be able to specify the desired ordering, so it should be
possible to write such a format now based on the container logic, but
without any of the inventory representation or delta representation changes.
This would for instance involve pack combining ordering the existing diffs
in reverse order.</li>
<li>Inventory representation: This has a dependency on what data is
dropped from the core and what is kept. Without those changes being known we
can implement a new representation, but it won’t be a final one. One of the
services the new inventory representation is expected to deliver is one of
validators for subtrees – a means of comparing just subtrees of two
inventories without comparing all the data within that subtree.</li>
<li>Delta storage optimisation: This has a strict dependency on a new repository
format. Optimisation takes many forms - we probably cannot complete the
desired optimisations under knits though we could use xdelta within a
knit-variation.</li>
<li>Greatest distance from origin cache: The potential users of this exist
today, it is likely able to be implemented immediately, but we are not sure
that its needed anymore, so it is being shelved.</li>
<li>Removing derivable data: It’s very hard to do this while the derived data is
exposed in API’s but not used by commands. Implemented the targeted API’s
for our core use cases should allow use to remove accidental use of derived
data, making only explicit uses of it visible, and isolating the impact of
removing it : allowing us to experiment sensibly. This covers both dropping
the per-file merge graph and the hash-based-names proposals.</li>
</ul>
</div></blockquote>
</div>
</div>
<div class="section" id="analysis-of-use-cases">
<h2><a class="toc-backref" href="#id25">1.2&nbsp;&nbsp;&nbsp;Analysis of use cases</a><a class="headerlink" href="#analysis-of-use-cases" title="Permalink to this headline">¶</a></h2>
<div class="section" id="analysing-a-specific-use-case">
<h3><a class="toc-backref" href="#id26">1.2.1&nbsp;&nbsp;&nbsp;Analysing a specific use case</a><a class="headerlink" href="#analysing-a-specific-use-case" title="Permalink to this headline">¶</a></h3>
<dl class="docutils">
<dt>The analysis of a use case needs to provide as outputs:</dt>
<dd><ul class="first last simple">
<li>The functional requirements that the use case has to satisfy.</li>
<li>The file level operations and access patterns that will give the best
performance.</li>
<li>A low friction API which will allow the use case to be implemented.</li>
<li>The release of bzr (and thus the supported features) for which the analysis
was performed. The feature set of bzr defines the access patterns and data
required to implement any use case. So when we add features, their design
changes the requirements for the parts of the system they alter, so we need
to re-analyse use cases when bzr’s feature set changes. If future plans are
considered in the analysis with the intention of avoiding rework, these
should also be mentioned.</li>
</ul>
</dd>
</dl>
</div>
<div class="section" id="performing-the-analysis">
<h3><a class="toc-backref" href="#id27">1.2.2&nbsp;&nbsp;&nbsp;Performing the analysis</a><a class="headerlink" href="#performing-the-analysis" title="Permalink to this headline">¶</a></h3>
<p>The analysis needs to be able to define the characteristics of the
involved disk storage and APIs. That means we need to examine the data
required for the operation, in what order it is required, on both the
read and write sides, and how that needs to be presented to be
consistent with our layering.</p>
<p>As a quick example: ‘annotation of a file requires the file id looked up
from the tree, the basis revision id from the tree, and then the text of
that fileid-revisionid pair along with the creating revision id
allocated to each line, and the dotted revision number of each of those
revision ids.’ All three of our key domain objects are involved here,
but we haven’t defined any characteristics of the api or disk facilities
yet. We could then do that by saying something like ‘the file-id lookup
should degrade gracefully as trees become huge. The tree basis id should
be constant time. Retrieval of the annotated text should be roughly
constant for any text of the same size regardless of the number of
revisions contributing to its content. Mapping of the revision ids to
dotted revnos could be done as the text is retrieved, but it’s completely
fine to post-process the annotated text to obtain dotted-revnos.’</p>
</div>
<div class="section" id="what-factors-should-be-considered">
<h3><a class="toc-backref" href="#id28">1.2.3&nbsp;&nbsp;&nbsp;What factors should be considered?</a><a class="headerlink" href="#what-factors-should-be-considered" title="Permalink to this headline">¶</a></h3>
<p>Obviously, those that will make for an extremely fast system :). There
are many possible factors, but the ones I think are most interesting to
design with are:</p>
<ul>
<li><p class="first">baseline overhead:</p>
<blockquote>
<div><ul class="simple">
<li>The time to get bzr ready to begin the use case.</li>
</ul>
</div></blockquote>
</li>
<li><p class="first">scaling: how does performance change when any of the follow aspects
of the system are ratcheted massively up or down:</p>
<blockquote>
<div><ul class="simple">
<li>number of files/dirs/symlinks/subtrees in a tree (both working and
revision trees)</li>
<li>size of any particular file</li>
<li>number of elements within a single directory</li>
<li>length of symlinks</li>
<li>number of changes to any file over time
(subordinately also the number of merges of the file)</li>
<li>number of commits in the ancestry of a branch
(subordinately also the number of merges)</li>
<li>number of revisions in a repository</li>
<li>number of fileids in a repository</li>
<li>number of ghosts in a given graph (revision or per-file)</li>
<li>number of branches in a repository</li>
<li>number of concurrent readers for a tree/branch/repository</li>
<li>number of concurrent writers for objects that support that.</li>
<li>latency to perform file operations (e.g. slow disks, network file systems,
our VFS layer and FTP/SFTP/etc)</li>
<li>bandwidth to the disk storage</li>
<li>latency to perform semantic operations (hpss specific)</li>
<li>bandwidth when performing semantic operations.</li>
</ul>
</div></blockquote>
</li>
<li><p class="first">locality of reference: If an operation requires data that is located
within a small region at any point, we often get better performance
than with an implementation of the same operation that requires the
same amount of data but with a lower locality of reference. It’s
fairly tricky to add locality of reference after the fact, so I think
its worth considering up front.</p>
</li>
</ul>
<p>Using these factors, to the annotate example we can add that its
reasonable to do two ‘semantic’ round trips to the local tree, one to
the branch object, and two to the repository. In file-operation level
measurements, in an ideal world there would be no more than one round
trip for each semantic operation. What there must not be is one round
trip per revision involved in the revisionid-&gt;dotted number mapping, nor
per each revision id attributed to a line in the text.</p>
<p>Not all the items mentioned above are created equal. The analysis should
include the parameters considered and the common case values for each - the
optimisation should be around the common cases not around the exceptions.</p>
<p>For instance, we have a smart server now; file level operations are relatively
low latency and we should use that as the common case. At this point we intend
to preserve the performance of the dumb protocol networking, but focus on
improving network performance via the smart server and thus escape the
file-level operation latency considerations.</p>
<p>Many performance problems only become visible when changing the scaling knobs
upwards to large trees. On small trees it’s our baseline performance that drives
incremental improvements; on large trees it’s the amount of processing per item
that drives performance. A significant goal therefore is to keep the amount of
data to be processed under control. Ideally we can scale in a sublinear fashion
for all operations, but we MUST NOT scale even linearly for operations that
invoke a latency multiplier. For example, reading a file on disk requires
finding the inode for the file, then the block with the data and returning the
contents. Due to directory grouping logic we pay a massive price to read files
if we do not group the reads of files within the same directory.</p>
</div>
</div>
<div class="section" id="use-cases">
<h2><a class="toc-backref" href="#id29">1.3&nbsp;&nbsp;&nbsp;Use cases</a><a class="headerlink" href="#use-cases" title="Permalink to this headline">¶</a></h2>
<div class="section" id="initial-push-pull">
<h3><a class="toc-backref" href="#id30">1.3.1&nbsp;&nbsp;&nbsp;Initial push / pull</a><a class="headerlink" href="#initial-push-pull" title="Permalink to this headline">¶</a></h3>
<div class="section" id="optimal-case">
<h4><a class="toc-backref" href="#id31">1.3.1.1&nbsp;&nbsp;&nbsp;Optimal case</a><a class="headerlink" href="#optimal-case" title="Permalink to this headline">¶</a></h4>
<p>(a motivating example of ultimate performance)
Assume there is a file with exactly the right data in compressed form.  This
may be a tarred branch, a bundle, or a blob format.  Performance in this case
scales with the size of the file.</p>
</div>
<div class="section" id="disk-case">
<h4><a class="toc-backref" href="#id32">1.3.1.2&nbsp;&nbsp;&nbsp;Disk case</a><a class="headerlink" href="#disk-case" title="Permalink to this headline">¶</a></h4>
<p>Assume current repo format.  Attempt to achieve parity with <code class="docutils literal notranslate"><span class="pre">cp</span> <span class="pre">-r</span></code>.  Read
each file only 1 time.</p>
<ul class="simple">
<li>read knit graph for revisions</li>
<li>write filtered copy of revision knit O(d+a)</li>
<li>write filtered copy of knit index O(d)</li>
<li>Open knit index for inventory</li>
<li>Write a filtered copy of inventory knit and simultaneously not all referenced
file-ids O(b+d)</li>
<li>Write filtered copy of inventory knit index O(d)</li>
<li>For each referenced file-id:<ul>
<li>Open knit index for each file knit O(e)</li>
<li>If acceptable threshold of irrelevant data hard-link O(f)</li>
<li>Otherwise write filtered copy of text knit and simultaneously write
the fulltext to tree transform O(h)</li>
</ul>
</li>
<li>Write format markers O(1)</li>
</ul>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">a:</th><td class="field-body">size of aggregate revision metadata</td>
</tr>
<tr class="field-even field"><th class="field-name">b:</th><td class="field-body">size of inventory changes for all revisions</td>
</tr>
<tr class="field-odd field"><th class="field-name">c:</th><td class="field-body">size of text changes for all files and all revisions (e * g)</td>
</tr>
<tr class="field-even field"><th class="field-name">d:</th><td class="field-body">number of relevant revisions</td>
</tr>
<tr class="field-odd field"><th class="field-name">e:</th><td class="field-body">number of relevant versioned files</td>
</tr>
<tr class="field-even field"><th class="field-name">f:</th><td class="field-body">size of the particular versioned file knit index</td>
</tr>
<tr class="field-odd field"><th class="field-name">g:</th><td class="field-body">size of the filtered versioned file knit</td>
</tr>
<tr class="field-even field"><th class="field-name">h:</th><td class="field-body">size of the versioned file fulltext</td>
</tr>
<tr class="field-odd field"><th class="field-name">i:</th><td class="field-body">size of the largest file fulltext</td>
</tr>
</tbody>
</table>
</div>
<div class="section" id="smart-network-case">
<h4><a class="toc-backref" href="#id33">1.3.1.3&nbsp;&nbsp;&nbsp;Smart Network Case</a><a class="headerlink" href="#smart-network-case" title="Permalink to this headline">¶</a></h4>
<div class="section" id="phase-1">
<h5><a class="toc-backref" href="#id34">1.3.1.3.1&nbsp;&nbsp;&nbsp;Phase 1</a><a class="headerlink" href="#phase-1" title="Permalink to this headline">¶</a></h5>
<p>Push: ask if there is a repository, and if not, what formats are okay
Pull: Nothing</p>
</div>
<div class="section" id="phase-2">
<h5><a class="toc-backref" href="#id35">1.3.1.3.2&nbsp;&nbsp;&nbsp;Phase 2</a><a class="headerlink" href="#phase-2" title="Permalink to this headline">¶</a></h5>
<p>Push: send initial push command, streaming data in acceptable format, following
disk case strategy
Pull: receive initial pull command, specifying format</p>
<p>Pull client complexity: O(a), memory cost O(1)
Push client complexity: procesing and memory cost same as disk case</p>
</div>
</div>
<div class="section" id="dumb-network-case">
<h4><a class="toc-backref" href="#id36">1.3.1.4&nbsp;&nbsp;&nbsp;Dumb Network Case</a><a class="headerlink" href="#dumb-network-case" title="Permalink to this headline">¶</a></h4>
<p>Pull: same as disk case, but request all file knit indices at once and request
al file knits at once.
Push: same as disk case, but write all files at once.</p>
</div>
<div class="section" id="wants">
<h4><a class="toc-backref" href="#id37">1.3.1.5&nbsp;&nbsp;&nbsp;Wants</a><a class="headerlink" href="#wants" title="Permalink to this headline">¶</a></h4>
<ul class="simple">
<li>Read partial graph</li>
<li>Read multiple segments of multiple files on HTTP and SFTP</li>
<li>Write multiple files over SFTP</li>
</ul>
</div>
</div>
<div class="section" id="incremental-push-pull">
<h3><a class="toc-backref" href="#id38">1.3.2&nbsp;&nbsp;&nbsp;Incremental push/pull</a><a class="headerlink" href="#incremental-push-pull" title="Permalink to this headline">¶</a></h3>
<p>This use case covers pulling in or pushing out some number of revisions which
is typically a small fraction of the number already present in the target
repository. Pushing and pulling are defined as branch level operations for ease
of interaction with VCS systems that have no repository abstraction (such as
bzr-svn or GNU Arch) but within bzrlib’s core they are currently the
responsibility of the Repository object.</p>
<div class="section" id="functional-requirements">
<h4><a class="toc-backref" href="#id39">1.3.2.1&nbsp;&nbsp;&nbsp;Functional Requirements</a><a class="headerlink" href="#functional-requirements" title="Permalink to this headline">¶</a></h4>
<dl class="docutils">
<dt>A push or pull operation must:</dt>
<dd><ul class="first last simple">
<li>Copy all the data to reconstruct the selected revisions in the target
branch. This is the goal of push and pull after all.</li>
<li>Reject corrupt data. As bzr has no innate mechanism for discarding corrupted
data, corrupted data should not be incorporated accidentally.</li>
</ul>
</dd>
</dl>
</div>
<div class="section" id="factors-which-should-add-work-for-push-pull">
<h4><a class="toc-backref" href="#id40">1.3.2.2&nbsp;&nbsp;&nbsp;Factors which should add work for push/pull</a><a class="headerlink" href="#factors-which-should-add-work-for-push-pull" title="Permalink to this headline">¶</a></h4>
<blockquote>
<div><ul class="simple">
<li>Baseline overhead: The time to connect to both branches.</li>
<li>Actual new data in the revisions being pulled (drives the amount of data to
move around, includes the commit messages etc)</li>
<li>Number of revisions in the two repositories (scaling affects the
determination of what revisions to move around).</li>
</ul>
</div></blockquote>
</div>
<div class="section" id="push-pull-overview">
<h4><a class="toc-backref" href="#id41">1.3.2.3&nbsp;&nbsp;&nbsp;Push/pull overview</a><a class="headerlink" href="#push-pull-overview" title="Permalink to this headline">¶</a></h4>
<ol class="arabic simple">
<li>New data is identified in the source repository.</li>
<li>That data is read from the source repository.</li>
<li>The same data is verified and written to the target repository in such a
manner that it’s not visible to readers until it’s ready for use.</li>
</ol>
<div class="section" id="new-data-identification">
<h5><a class="toc-backref" href="#id42">1.3.2.3.1&nbsp;&nbsp;&nbsp;New data identification</a><a class="headerlink" href="#new-data-identification" title="Permalink to this headline">¶</a></h5>
<p>We have a single top level data object: revisions. Everything else is
subordinate to revisions, so determining the revisions to propagate should be
all thats needed. This depends on revisions with partial data - such as those
with no signature - being flagged in some efficient manner.</p>
<p>We could do this in two manners: determine revisions to sync and signatures to sync in two passes, or change the ‘value’ of a revision implicitly when the signature is different. E.g. by using merkle hash trees with the signature data a separate component the signatures will naturally be identified to sync.</p>
<p>We want to only exchange data proportional to the number of new revisions and
signatures in the system though. One way to achieve this for revisions is to
walk the graph out from the desired tips until the surface area intersection is
found. For signatures a set difference seems to be needed as there is no DAG of signatures: the presence of one has no implications on the presence of another, so a full pass over the set of signatures would be required to confirm no new signatures are needed (let alone replaced signatures).</p>
<p>IFF we can determine ‘new revisions’ and ‘new signatures’ without full graph access then we can scale acceptable for push and pull.</p>
<p>Ghosts are revisions which are not present in a particular repository. Filling ghosts refers to removing ghosts in the target repository when the ghost is present in the source repository. Filling ghosts can be either an explicit or implicit action. The common case is no ghosts.</p>
<div class="section" id="set-synchronisation-approaches">
<h6><a class="toc-backref" href="#id43">1.3.2.3.1.1&nbsp;&nbsp;&nbsp;Set synchronisation approaches</a><a class="headerlink" href="#set-synchronisation-approaches" title="Permalink to this headline">¶</a></h6>
<p>A set synchronisation approach is one which synchronises two sets without
regard for innate structure. This can be very efficient but requires adding a
new node to be processed with every commit. Caching of the results of the
various set based syncs I’ve seen is possible but because the data structures
look different depending on the tip revision being synced up to the cache needs
to be very complex. I recommend not using such an approach for the common case
pull because of the failure to scale. We can use such an approach for
synchronisation of new signatures and ghosts, which should be an explicit
option in both cases.</p>
</div>
<div class="section" id="dag-synchronisation-approaches">
<h6><a class="toc-backref" href="#id44">1.3.2.3.1.2&nbsp;&nbsp;&nbsp;DAG synchronisation approaches</a><a class="headerlink" href="#dag-synchronisation-approaches" title="Permalink to this headline">¶</a></h6>
<p>A DAG based approach to synchronistion is one that uses the DAG structure to
determine the difference in present nodes. It can as a result operate from the
tip of the DAG backwards. A dag based approach should allow incremental access
to data and not require a full-graph scan for incremental operations.</p>
</div>
<div class="section" id="file-level-scaling">
<h6><a class="toc-backref" href="#id45">1.3.2.3.1.3&nbsp;&nbsp;&nbsp;File level scaling</a><a class="headerlink" href="#file-level-scaling" title="Permalink to this headline">¶</a></h6>
<p>We should read roughly as much of the revision level graph as is needed from
each repository to determine the node difference.  If requested we should
perform a detailed scan to pick up ghost revisions and revisions which have had
signatures added. This should not be the default as it requires full history
access in both cases.</p>
<p>Expected file IO and access pattern:</p>
<blockquote>
<div><ul>
<li><p class="first">Common case: repo with many branches of one project, to the same.</p>
<ol class="arabic simple">
<li>Source and Target branch tips read.</li>
<li>Find the tip of each branch in their repo (will require reading some of
the revision graph but is typically near the end of the graph).</li>
<li>Read and parse increasing amounts of the revision graph until one is
found to be a subset of the other, or a complete list of revisions to be
transmitted is created.</li>
</ol>
</li>
<li><p class="first">Uncommon cases:</p>
<ol class="arabic simple">
<li>Repositories with many projects or branches which are very old may
require reading a lot of unrelated graph data.</li>
</ol>
<ol class="arabic simple">
<li>Initial push/pull scenarios should not require reading an entire graph.</li>
</ol>
</li>
</ul>
</div></blockquote>
</div>
<div class="section" id="api-scaling">
<h6><a class="toc-backref" href="#id46">1.3.2.3.1.4&nbsp;&nbsp;&nbsp;API scaling</a><a class="headerlink" href="#api-scaling" title="Permalink to this headline">¶</a></h6>
<blockquote>
<div><ol class="arabic simple">
<li>Get branch tips.</li>
<li>Determine one sided graph difference. To avoid obtaining a full graph over
the wire this needs to be done without reference to the full graph, and
with some logarthmic scaling algorithm. There are several already available
for this.</li>
</ol>
</div></blockquote>
<p>With ghost and new-signature detection:</p>
<blockquote>
<div><ul class="simple">
<li>File IO access pattern will read the entire graph on the ‘target’ side - if
no ghosts are present then stop, otherwise seek the new revisions on the
source side with the regular algorithm and also explicitly search for the
ghost points from the target; plus a set difference search is needed on
signatures.</li>
<li>Semantic level can probably be tuned, but as it’s also complex I suggest
deferring analysis for optimal behaviour of this use case.</li>
</ul>
</div></blockquote>
</div>
</div>
<div class="section" id="data-reading">
<h5><a class="toc-backref" href="#id47">1.3.2.3.2&nbsp;&nbsp;&nbsp;Data reading</a><a class="headerlink" href="#data-reading" title="Permalink to this headline">¶</a></h5>
<p>When transferring information about a revision the graph of data for the
revision is walked: revision -&gt; inventory, revision -&gt; matching signature,
inventory -&gt; file ids:revision pairs.</p>
<div class="section" id="id1">
<h6><a class="toc-backref" href="#id48">1.3.2.3.2.1&nbsp;&nbsp;&nbsp;File level scaling</a><a class="headerlink" href="#id1" title="Permalink to this headline">¶</a></h6>
<p>As we’re reading already committed data, as long as nothing is mutating data on
disk reading should be race free. We will:</p>
<blockquote>
<div><ul class="simple">
<li>read each revision object</li>
<li>read the matching inventory delta</li>
<li>attempt to read a signature object</li>
<li>parse the inventory delta</li>
<li>read the fileid:revisionid compressed chunk for each line in the inventory
delta</li>
</ul>
</div></blockquote>
<p>Theres no point validating that the data read is valid, as transmission through to the client writing the data might invalidate it; we need to validate before we write.</p>
</div>
<div class="section" id="id2">
<h6><a class="toc-backref" href="#id49">1.3.2.3.2.2&nbsp;&nbsp;&nbsp;API scaling</a><a class="headerlink" href="#id2" title="Permalink to this headline">¶</a></h6>
<p>Given that we have established the revisions needed, a single API call should
suffice to obtain all data; the API should present the data in such an order
that it can be validated as it arrives and thus not require large scale
buffering on disk. Specifically each item of data should be validatable (e.g.
for some file data we want the fileid:revisionid:validationhash + content).</p>
</div>
</div>
<div class="section" id="data-verification-and-writing">
<h5><a class="toc-backref" href="#id50">1.3.2.3.3&nbsp;&nbsp;&nbsp;Data Verification and writing</a><a class="headerlink" href="#data-verification-and-writing" title="Permalink to this headline">¶</a></h5>
<p>New data written to a repository should be completed intact when it is made
visible. This suggests that either all the data for a revision must be made
atomically visible (e.g. by renaming a single file) or the leaf nodes of the
reference graph must become visible first.</p>
<p>Data is referred to via the following graph:
revision -&gt; revision
revision -&gt; signature
revision -&gt; inventory
inventory -&gt; fileid:revisionid
fileid:revisionid -&gt; fileid:revisionid</p>
<p>Data is verifiable via a different ordering:
signature -&gt; revision -&gt; inventory -&gt; fileid:revisionid texts.</p>
<p>We dont gpg verify each revision today; this analysis only speaks to hash
verification of contents.</p>
<p>To validate a revision we need to validate the data it refers to. But to
validate the contents of a revision we need the new texts in the inventory for
the revision - to check a fileid:revisionid we need to know the expected sha1
of the full text and thus also need to read the delta chain to construct the
text as we accept it to determine if it’s valid. Providing separate validators
for the chosen representation would address this.
e.g: For an inventory entry FILEID:REVISIONID we store the validator of the
full text :SHA1:. If we also stored the validator of the chosen disk
representation (:DELTASHA1:) we could validate the transmitted representation
without expanding the delta in the common case. If that failed we could expand
the delta chain and try against the full text validator, and finally fail. As
different delta generators might generate different deltas, :DELTASHA1: should
not become part of the revision validator, only the inventory disk encoding. In
a related manner a transmission format that allowed cheap validation of content
without applying locally stored deltas would be advantageous because no local
reads would be incurred to validate new content. For instance, always sending a
full text for any file, possibly with a delta-chain when transmitting multiple
revisionids of the file, would allow this. (git pack-files have this property).</p>
<div class="section" id="overview-summary">
<h6><a class="toc-backref" href="#id51">1.3.2.3.3.1&nbsp;&nbsp;&nbsp;Overview summary</a><a class="headerlink" href="#overview-summary" title="Permalink to this headline">¶</a></h6>
<p>A single-file local format would allow safe atomic addition of data while
allowing optimisal transmission order of data. Failing this the validation of
data should be tuned to not require reading local texts during data addition
even in the presence of delta chains. We should have transmission-validators
separate from content validators that allow validation of the delta-transmitted
form of objects.</p>
</div>
<div class="section" id="id3">
<h6><a class="toc-backref" href="#id52">1.3.2.3.3.2&nbsp;&nbsp;&nbsp;File level scaling</a><a class="headerlink" href="#id3" title="Permalink to this headline">¶</a></h6>
<ul class="simple">
<li>Every new file text requires transmission and local serialisation.</li>
<li>Every commit requires transmission and storage of a revision, signature and inventory.</li>
</ul>
<p>Thus 4000 commits to a 50000 path tree of 10 files on averages requires (with
knits) between 26 writes (2*(3+10)) and 80006 (2*(4000*10 + 3)) writes. In all
cases there are 4000 * 13 distinct objects to record.</p>
<p>Grouping data by fileid, content and metadata, gives the figures above.
Data grouping:</p>
<ul class="simple">
<li>File per full identifier (fileid:revisionid:meta|content): 104000</li>
<li>Delta-chain per object: object id count * constant overhead per object id
(26 -&gt; 80006)</li>
<li>Collation/pack file: 1</li>
</ul>
<dl class="docutils">
<dt>Performance for these depends heavily on implementation:</dt>
<dd><ul class="first last simple">
<li>Using full ids we could name by validator or by id, giving best performance
that depends on either receiving data in validator order or in id order.</li>
<li>using delta-chain per object we get least seek overhead and syscall overhead
if we recieve in topological order within the object id, and object ids in
lexical order.</li>
<li>Using a collation/pack file we can stream it into place and validate as we go,
giving near ideal performance.</li>
</ul>
</dd>
</dl>
</div>
<div class="section" id="id4">
<h6><a class="toc-backref" href="#id53">1.3.2.3.3.3&nbsp;&nbsp;&nbsp;API scaling</a><a class="headerlink" href="#id4" title="Permalink to this headline">¶</a></h6>
<p>The api for writing new data recieved over the network will need to be geared
to the transmission and local storage method. What we need is for the
transmission method to reasonably closely match the desired write ordering
locally. This suggests that once we decide on the best local storage means we
should design the api.</p>
<p>take N commits from A to B, if B is local then merge changes into the tree.
copy ebough data to recreate snapshots
avoid ending up wth corrupt/bad data</p>
</div>
</div>
</div>
<div class="section" id="notes-from-london">
<h4><a class="toc-backref" href="#id54">1.3.2.4&nbsp;&nbsp;&nbsp;Notes from London</a><a class="headerlink" href="#notes-from-london" title="Permalink to this headline">¶</a></h4>
<blockquote>
<div><ol class="arabic simple">
<li>setup</li>
</ol>
<blockquote>
<div><p>look at graph of revisions for ~N comits to deretmine eligibility for
if preserve mainline is on, check LH only</p>
<blockquote>
<div><dl class="docutils">
<dt>identify objects to send that are not on the client repo</dt>
<dd><ul class="first last simple">
<li>revision - may be proportional to the graph</li>
<li>inventory - proportional to work</li>
<li>texts     - proportional to work</li>
<li>signatures - ???</li>
</ul>
</dd>
</dl>
</div></blockquote>
</div></blockquote>
<ol class="arabic simple">
<li>data transmission</li>
</ol>
<blockquote>
<div><ul class="simple">
<li>send data proportional to the new information</li>
<li>validate the data:</li>
</ul>
<blockquote>
<div><ol class="arabic simple">
<li>validate the sha1 of the full text of each transmitted text.</li>
<li>validate the sha1:name mapping in each newly referenced inventory item.</li>
<li>validate the sha1 of the XML of each inventory against the revision.
<strong>this is proportional to tree size and must be fixed</strong></li>
</ol>
</div></blockquote>
</div></blockquote>
<ol class="arabic simple">
<li>write the data to the local repo.
The API should output the file texts needed by the merge as by product of the transmission</li>
<li>tree application</li>
</ol>
</div></blockquote>
<p>Combine the output from the transmission step with additional ‘new work data’ for anything already in the local repository that is new in this tree.
should write new files and stat existing files proportional to the count of the new work and the size of the full texts.</p>
</div>
</div>
<div class="section" id="add">
<h3><a class="toc-backref" href="#id55">1.3.3&nbsp;&nbsp;&nbsp;Add</a><a class="headerlink" href="#add" title="Permalink to this headline">¶</a></h3>
<p>Add is used to recursively version some paths supplied by the user. Paths that
match ignore rules are not versioned, and paths that become versioned are
versioned in the nearest containing bzr tree. Currently we only do this within
a single tree, but perhaps with nested trees this should change.</p>
<div class="section" id="least-work-we-can-hope-to-perform">
<h4><a class="toc-backref" href="#id56">1.3.3.1&nbsp;&nbsp;&nbsp;Least work we can hope to perform</a><a class="headerlink" href="#least-work-we-can-hope-to-perform" title="Permalink to this headline">¶</a></h4>
<ul class="simple">
<li>Read a subset of the full versioned paths data for the tree matching the scope of the paths the user supplied.</li>
<li>Seek once to each directory within the scope and readdir its contents.</li>
<li>Probe if each directory is a child tree to avoid adding data for paths within a child tree.</li>
<li>Calculate the ignored status for paths not previously known to be ignored</li>
<li>Write data proportional to the newly versioned file count to record their versioning.</li>
<li>Assign a fileid for each path (so that merge –uncommitted can work immediately)</li>
</ul>
<p>Optionally:</p>
<ul class="simple">
<li>Print the ignore rule for each ignored path in the scope.</li>
<li>Print the path of each added file.</li>
<li>Print the total count of ignored files within the scopes.</li>
<li>Record the result of calculating ignored status for ignored files.
(proportional to the number we actually calculate).</li>
</ul>
</div>
<div class="section" id="per-file-algorithm">
<h4><a class="toc-backref" href="#id57">1.3.3.2&nbsp;&nbsp;&nbsp;Per file algorithm</a><a class="headerlink" href="#per-file-algorithm" title="Permalink to this headline">¶</a></h4>
<ol class="arabic simple">
<li>If the path is versioned, and it is a directory, push onto the recurse stack.</li>
<li>If the path is supplied by the user or is not ignored, version it, and if a
directory, push onto the recurse stack. Versioning the path may require
versioning the paths parents.</li>
<li>Output or otherwise record the ignored rule as per the user interface selected.</li>
</ol>
</div>
</div>
<div class="section" id="commit-performance-notes">
<h3><a class="toc-backref" href="#id58">1.3.4&nbsp;&nbsp;&nbsp;Commit Performance Notes</a><a class="headerlink" href="#commit-performance-notes" title="Permalink to this headline">¶</a></h3>
<div class="contents local topic" id="id5">
<ul class="auto-toc simple">
<li><a class="reference internal" href="#changes-to-commit" id="id114">1.3.4.1&nbsp;&nbsp;&nbsp;Changes to commit</a></li>
<li><a class="reference internal" href="#commit-the-minimum-work-required" id="id115">1.3.4.2&nbsp;&nbsp;&nbsp;Commit: The Minimum Work Required</a></li>
<li><a class="reference internal" href="#commit-vs-status" id="id116">1.3.4.3&nbsp;&nbsp;&nbsp;Commit vs Status</a></li>
<li><a class="reference internal" href="#avoiding-work-smarter-change-detection" id="id117">1.3.4.4&nbsp;&nbsp;&nbsp;Avoiding Work: Smarter Change Detection</a></li>
<li><a class="reference internal" href="#avoiding-work-better-layering" id="id118">1.3.4.5&nbsp;&nbsp;&nbsp;Avoiding Work: Better Layering</a></li>
<li><a class="reference internal" href="#avoiding-work-avoiding-reading-parent-data" id="id119">1.3.4.6&nbsp;&nbsp;&nbsp;Avoiding work: avoiding reading parent data</a></li>
<li><a class="reference internal" href="#code-structure" id="id120">1.3.4.7&nbsp;&nbsp;&nbsp;Code structure</a></li>
<li><a class="reference internal" href="#complications-of-commit" id="id121">1.3.4.8&nbsp;&nbsp;&nbsp;Complications of commit</a></li>
<li><a class="reference internal" href="#interface-stack" id="id122">1.3.4.9&nbsp;&nbsp;&nbsp;Interface stack</a></li>
<li><a class="reference internal" href="#branch-tree-interface" id="id123">1.3.4.10&nbsp;&nbsp;&nbsp;Branch-&gt;Tree interface</a></li>
<li><a class="reference internal" href="#information-from-the-tree-to-repository" id="id124">1.3.4.11&nbsp;&nbsp;&nbsp;Information from the tree to repository</a></li>
<li><a class="reference internal" href="#information-from-the-repository-to-the-tree" id="id125">1.3.4.12&nbsp;&nbsp;&nbsp;Information from the repository to the tree</a></li>
<li><a class="reference internal" href="#selective-commit" id="id126">1.3.4.13&nbsp;&nbsp;&nbsp;Selective commit</a></li>
<li><a class="reference internal" href="#common-commit-code" id="id127">1.3.4.14&nbsp;&nbsp;&nbsp;Common commit code</a></li>
<li><a class="reference internal" href="#order-of-traversal" id="id128">1.3.4.15&nbsp;&nbsp;&nbsp;Order of traversal</a></li>
<li><a class="reference internal" href="#open-question-per-file-graphs" id="id129">1.3.4.16&nbsp;&nbsp;&nbsp;Open question: per-file graphs</a></li>
</ul>
</div>
<div class="section" id="changes-to-commit">
<h4><a class="toc-backref" href="#id114">1.3.4.1&nbsp;&nbsp;&nbsp;Changes to commit</a><a class="headerlink" href="#changes-to-commit" title="Permalink to this headline">¶</a></h4>
<p>We want to improve the commit code in two phases.</p>
<p>Phase one is to have a better separation from the format-specific logic,
the user interface, and the general process of committing.</p>
<p>Phase two is to have better interfaces by which a good workingtree format
can efficiently pass data to a good storage format.  If we get phase one
right, it will be relatively easy and non-disruptive to bring this in.</p>
</div>
<div class="section" id="commit-the-minimum-work-required">
<h4><a class="toc-backref" href="#id115">1.3.4.2&nbsp;&nbsp;&nbsp;Commit: The Minimum Work Required</a><a class="headerlink" href="#commit-the-minimum-work-required" title="Permalink to this headline">¶</a></h4>
<p>Here is a description of the minimum work that commit must do.  We
want to make sure that our design doesn’t cost too much more than this
minimum.  I am trying to do this without making too many assumptions
about the underlying storage, but am assuming that the ui and basic
architecture (wt, branch, repo) stays about the same.</p>
<p>The basic purpose of commit is to:</p>
<ol class="arabic simple">
<li>create and store a new revision based on the contents of the working tree</li>
<li>make this the new basis revision for the working tree</li>
</ol>
<p>We can do a selected commit of only some files or subtrees.</p>
<p>The best performance we could hope for is:
- stat each versioned selected working file once
- read from the workingtree and write into the repository any new file texts
- in general, do work proportional to the size of the shape (eg
inventory) of the old and new selected trees, and to the total size of
the modified files</p>
<p>In more detail:</p>
<p>1.0 - Store new file texts: if a versioned file contains a new text
there is no avoiding storing it.  To determine which ones have changed
we must go over the workingtree and at least stat each file.  If the
file is modified since it was last hashed, it must be read in.
Ideally we would read it only once, and either notice that it has not
changed, or store it at that point.</p>
<p>On the other hand we want new code to be able to handle files that are
larger than will fit in memory.  We may then need to read each file up
to two times: once to determine if there is a new text and calculate
its hash, and again to store it.</p>
<p>1.1 - Store a tree-shape description (ie inventory or similar.)  This
describes the non-file objects, and provides a reference from the
Revision to the texts within it.</p>
<p>1.2 - Generate and store a new revision object.</p>
<p>1.3 - Do delta-compression on the stored objects.  (git notably does
not do this at commit time, deferring this entirely until later.)
This requires finding the appropriate basis for each modified file: in
the current scheme we get the file id, last-revision from the
dirstate, look into the knit for that text, extract that text in
total, generate a delta, then store that into the knit.  Most delta
operations are O(n**2) to O(n**3) in the size of the modified files.</p>
<p>1.4 - Cache annotation information for the changes: at the moment this
is done as part of the delta storage.  There are some flaws in that
approach, such as that it is not updated when ghosts are filled, and
the annotation can’t be re-run with new diff parameters.</p>
<p>2.1 - Make the new revision the basis for the tree, and clear the list
of parents.  Strictly this is all that’s logically necessary, unless
the working tree format requires more work.</p>
<p>The dirstate format does require more work, because it caches the
parent tree data for each file within the working tree data.  In
practice this means that every commit rewrites the entire dirstate
file - we could try to avoid rewriting the whole file but this may be
difficult because variable-length data (the last-changed revision id)
is inserted into many rows.</p>
<p>The current dirstate design then seems to mean that any commit of a
single file imposes a cost proportional to the size of the current
workingtree.  Maybe there are other benefits that outweigh this.
Alternatively if it was fast enough for operations to always look at
the original storage of the parent trees we could do without the
cache.</p>
<p>2.2 - Record the observed file hashes into the workingtree control
files.  For the files that we just committed, we have the information
to store a valid hash cache entry: we know their stat information and
the sha1 of the file contents.  This is not strictly necessary to the
speed of commit, but it will be useful later in avoiding reading those
files, and the only cost of doing it now is writing it out.</p>
<p>In fact there are some user interface niceties that complicate this:</p>
<p>3 - Before starting the commit proper, we prompt for a commit message
and in that commit message editor we show a list of the files that
will be committed: basically the output of bzr status.  This is
basically the same as the list of changes we detect while storing the
commit, but because the user will sometimes change the tree after
opening the commit editor and expect the final state to be committed I
think we do have to look for changes twice.  Since it takes the user a
while to enter a message this is not a big problem as long as both the
status summary and the commit are individually fast.</p>
<p>4 - As the commit proceeds (or after?) we show another status-like
summary.  Just printing the names of modified files as they’re stored
would be easy.  Recording deleted and renamed files or directories is
more work: this can only be done by reference to the primary parent
tree and requires it be read in.  Worse, reporting renames requires
searching by id across the entire parent tree.   Possibly full
reporting should be a default-off verbose option because it does
require more work beyond the commit itself.</p>
<p>5 - Bazaar currently allows for missing files to be automatically
marked as removed at the time of commit.  Leaving aside the ui
consequences, this means that we have to update the working inventory
to mark these files as removed.  Since as discussed above we always
have to rewrite the dirstate on commit this is not substantial, though
we should make sure we do this in one pass, not two.  I have
previously proposed to make this behaviour a non-default option.</p>
<p>We may need to run hooks or generate signatures during commit, but
they don’t seem to have substantial performance consequences.</p>
<p>If one wanted to optimize solely for the speed of commit I think
hash-addressed  file-per-text storage like in git (or bzr 0.1) is very
good.  Remarkably, it does not need to read the inventory for the
previous revision.  For each versioned file, we just need to get its
hash, either by reading the file or validating its stat data.  If that
hash is not already in the repository, the file is just copied in and
compressed.  As directories are traversed, they’re turned into texts
and stored as well, and then finally the revision is too.  This does
depend on later doing some delta compression of these texts.</p>
<p>Variations on this are possible.  Rather than writing a single file
into the repository for each text, we could fold them into a single
collation or pack file.  That would create a smaller number of files
in the repository, but looking up a single text would require looking
into their indexes rather than just asking the filesystem.</p>
<p>Rather than using hashes we can use file-id/rev-id pairs as at
present, which has several consequences pro and con.</p>
</div>
<div class="section" id="commit-vs-status">
<h4><a class="toc-backref" href="#id116">1.3.4.3&nbsp;&nbsp;&nbsp;Commit vs Status</a><a class="headerlink" href="#commit-vs-status" title="Permalink to this headline">¶</a></h4>
<p>At first glance, commit simply stores the changes status reports. In fact,
this isn’t technically correct: commit considers some files modified that
status does not. The notes below were put together by John Arbash Meinel
and Aaron Bentley in May 2007 to explain the finer details of commit to
Ian Clatworthy. They are recorded here as they are likely to be useful to
others new to Bazaar …</p>
<ol class="arabic simple">
<li><strong>Unknown files have a different effect.</strong> With –no-strict (the default)
they have no effect and can be completely ignored. With –strict they
should cause the commit to abort (so you don’t forget to add the two new
test files that you just created).</li>
<li><strong>Multiple parents.</strong> ‘status’ always compares 2 trees, typically the
last-committed tree and the current working tree. ‘commit’ will compare
more trees if there has been a merge.</li>
</ol>
<blockquote>
<div><ol class="loweralpha simple">
<li>The “last modified” property for files.
A file may be marked as changed since the last commit, but that
change may have come in from the merge, and the change could have
happened several commits back. There are several edge cases to be
handled here, like if both branches modified the same file, or if
just one branch modified it.</li>
<li>The trickier case is when a file appears unmodified since last
commit, but it was modified versus one of the merged branches. I
believe there are a few ways this can happen, like if a merged
branch changes a file and then reverts it back (you still update
the ‘last modified’ field).
In general, if both sides disagree on the ‘last-modified’ flag,
then you need to generate a new entry pointing ‘last-modified’ at
this revision (because you are resolving the differences between
the 2 parents).</li>
</ol>
</div></blockquote>
<ol class="arabic" start="3">
<li><p class="first"><strong>Automatic deletion of ‘missing’ files.</strong> This is a point that we go
back and forth on. I think the basic idea is that ‘bzr commit’ by
default should abort if it finds a ‘missing’ file (in case that file was
renamed rather than deleted), but ‘bzr commit –auto’ can add unknown
files and remove missing files automatically.</p>
</li>
<li><p class="first"><strong>sha1 for newly added files.</strong> status doesn’t really need this: it should
only care that the file is not present in base, but is present now. In
some ways commit doesn’t care either, since it needs to read and sha the
file itself anyway.</p>
</li>
<li><p class="first"><strong>Nested trees.</strong> status doesn’t recurse into nested trees, but commit does.
This is just because not all of the nested-trees work has been merged yet.</p>
<p>A tree-reference is considered modified if the subtree has been
committed since the last containing-tree commit.  But commit needs to
recurse into every subtree, to ensure that a commit is done if the
subtree has changed since its last commit.  _iter_changes only reports
on tree-references that are modified, so it can’t be used for doing
subtree commits.</p>
</li>
</ol>
</div>
<div class="section" id="avoiding-work-smarter-change-detection">
<h4><a class="toc-backref" href="#id117">1.3.4.4&nbsp;&nbsp;&nbsp;Avoiding Work: Smarter Change Detection</a><a class="headerlink" href="#avoiding-work-smarter-change-detection" title="Permalink to this headline">¶</a></h4>
<p>Commit currently walks through every file building an inventory. Here is
Aaron’s brain dump on a better way …</p>
<p>_iter_changes won’t tell us about tree references that haven’t changed,
even if those subtrees have changed.  (Unless we ask for unchanged
files, which we don’t want to do, of course.)</p>
<p>There is an iter_references method, but using it looks just as expensive
as calling kind().</p>
<p>I did some work on updating commit to use iter_changes, but found for
multi-parent trees, I had to fall back to the slow inventory comparison
approach.</p>
<p>Really, I think we need a call akin to iter_changes that handles
multiple parents, and knows to emit entries when InventoryEntry.revision
is all that’s changed.</p>
</div>
<div class="section" id="avoiding-work-better-layering">
<h4><a class="toc-backref" href="#id118">1.3.4.5&nbsp;&nbsp;&nbsp;Avoiding Work: Better Layering</a><a class="headerlink" href="#avoiding-work-better-layering" title="Permalink to this headline">¶</a></h4>
<p>For each file, commit is currently doing more work than it should. Here is
John’s take on a better way …</p>
<p>Note that “_iter_changes” <em>does</em> have to touch every path on disk, but
it just can do it in a more efficient manner. (It doesn’t have to create
an InventoryEntry for all the ones that haven’t changed).</p>
<p>I agree with Aaron that we need something a little different than
_iter_changes. Both because of handling multiple parents, as well as we
don’t want it to actually read the files if we have a stat-cache miss.</p>
<p>Specifically, the commit code <em>has</em> to read the files because it is
going to add the text to the repository, and we want it to compute the
sha1 at <em>that</em> time, so we are guaranteed to have the valid sha (rather
than just whatever the last cached one was). So we want the code to
return ‘None’ if it doesn’t have an up-to-date sha1, rather than reading
the file and computing it, just before it returns it to the parent.</p>
<p>The commit code (0.16) should really be restructured. It’s layering is
pretty wrong.</p>
<p>Specifically, calling “kind()” requires a stat of the file. But we have
to do a stat to get the size/whether the record is up-to-date, etc. So
we really need to have a “create_an_up_to_date_inventory()” function.
But because we are accessing every object on disk, we want to be working
in tuples rather than Inventory objects. And because DirState already
has the parent records next to the current working inventory, it can do
all the work to do really fast comparison and throw-away of unimportant
records.</p>
<p>The way I made “bzr status” fast is by moving the ‘ignore this record’
ability as deep into the stack as I could get. Status has the property
that you don’t care about most of the records, just like commit. So the
sooner you can stop evaluating the 99% that you don’t care about, the
less work you do.</p>
</div>
<div class="section" id="avoiding-work-avoiding-reading-parent-data">
<h4><a class="toc-backref" href="#id119">1.3.4.6&nbsp;&nbsp;&nbsp;Avoiding work: avoiding reading parent data</a><a class="headerlink" href="#avoiding-work-avoiding-reading-parent-data" title="Permalink to this headline">¶</a></h4>
<p>We would like to avoid the work of reading any data about the parent
revisions.  We should at least try to avoid reading anything from the
repository; we can also consider whether it is possible or useful to hold
less parent information in the working tree.</p>
<p>When a commit of selected files is requested, the committed snapshot is a
composite of some directories from the parent revision and some from the
working tree.  In this case it is logically necessary to have the parent
inventory information.</p>
<p>If file last-change information or per-file graph information is stored
then it must be available from the parent trees.</p>
<p>If the Branch’s storage method does delta compression at commit time it
may need to retrieve file or inventory texts from the repository.</p>
<p>It is desirable to avoid roundtrips to the Repository during commit,
particularly because it may be remote.  If the WorkingTree can determine
by itself that a text was in the parent and therefore should be in the
Repository that avoids one roundtrip per file.</p>
<p>There is a possibility here that the parent revision is not stored, or not
correctly stored, in the repository the tree is being committed into, and
so the committed tree would not be reconstructable.  We could check that
the parent revision is present in the inventory and rely on the invariant
that if a revision is present, everything to reconstruct it will be
present too.</p>
</div>
<div class="section" id="code-structure">
<h4><a class="toc-backref" href="#id120">1.3.4.7&nbsp;&nbsp;&nbsp;Code structure</a><a class="headerlink" href="#code-structure" title="Permalink to this headline">¶</a></h4>
<p>Caller starts a commit</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">Branch</span><span class="o">.</span><span class="n">commit</span><span class="p">(</span><span class="n">from_tree</span><span class="p">,</span> <span class="n">options</span><span class="p">)</span>
</pre></div>
</div>
<p>This creates a CommitBuilder object matched to the Branch, Repository and
Tree.  It can vary depending on model differences or by knowledge of what
is efficient with the Repository and Tree.  Model differences might
include whether no-text-change merges need to be reported, and whether the</p>
<p>The basic CommitBuilder.commit structure can be</p>
<ol class="arabic simple">
<li>Ask the branch if it is ready to commit (up to date with master if
any.)</li>
<li>Ask the tree if it is ready to commit to the branch (up to date with
branch?), no conflicts, etc</li>
<li>Commit changed files; prototype implementation:<ol class="loweralpha">
<li>Ask the working tree for all committable files; for each it should
return the per-file parents, stat information, kind, etc.</li>
<li>Ask the repository to store the new file text; the repository should
return the stored sha1 and new revision id.</li>
</ol>
</li>
<li>Commit changed inventory</li>
<li>Commit revision object</li>
</ol>
</div>
<div class="section" id="complications-of-commit">
<h4><a class="toc-backref" href="#id121">1.3.4.8&nbsp;&nbsp;&nbsp;Complications of commit</a><a class="headerlink" href="#complications-of-commit" title="Permalink to this headline">¶</a></h4>
<p>Bazaar (as of 0.17) does not support selective-file commit of a merge;
this could be done if we decide how it should be recorded - is this to be
stored as an overall merge revision; as a preliminary non-merge revisions;
or will the per-file graph diverge from the revision graph.</p>
<p>There are several checks that may cause the commit to be refused, which
may be activated or deactivated by options.</p>
<ul class="simple">
<li>presence of conflicts in the tree</li>
<li>presence of unknown files</li>
<li>the working tree basis is up to date with the branch tip</li>
<li>the local branch is up to date with the master branch, if there
is one and –local is not specified</li>
<li>an empty commit message is given,</li>
<li>a hook flags an error</li>
<li>a “pointless” commit, with no inventory changes</li>
</ul>
<p>Most of these require walking the tree and can be easily done while
recording the tree shape.  This does require that it be possible to abort
the commit after the tree changes have been recorded.  It could be ok to
either leave the unreachable partly-committed records in the repository,
or to roll back.</p>
<p>Other complications:</p>
<ul class="simple">
<li>when automatically adding new files or deleting missing files during
commit, they must be noted during commit and written into the working
tree at some point</li>
<li>refuse “pointless” commits with no file changes - should be easy by
just refusing to do the final step of storing a new overall inventory
and revision object</li>
<li>heuristic detection of renames between add and delete (out of scope for
this change)</li>
<li>pushing changes to a master branch if any</li>
<li>running hooks, pre and post commit</li>
<li>prompting for a commit message if necessary, including a list of the
changes that have already been observed</li>
<li>if there are tree references and recursing into them is enabled, then
do so</li>
</ul>
<p>Commit needs to protect against duplicated file ids</p>
<p>Updates that need to be made in the working tree, either on conclusion
of commit or during the scan, include</p>
<ul class="simple">
<li>Changes made to the tree shape, including automatic adds, renames or
deletes</li>
<li>For trees (eg dirstate) that cache parent inventories, the old parent
information must be removed and the new one inserted</li>
<li>The tree hashcache information should be updated to reflect the stat
value at which the file was the same as the committed version, and the
content hash it was observed to have.  This needs to be done carefully to
prevent inconsistencies if the file is modified during or shortly after
the commit.  Perhaps it would work to read the mtime of the file before we
read its text to commit.</li>
</ul>
</div>
<div class="section" id="interface-stack">
<h4><a class="toc-backref" href="#id122">1.3.4.9&nbsp;&nbsp;&nbsp;Interface stack</a><a class="headerlink" href="#interface-stack" title="Permalink to this headline">¶</a></h4>
<p>The commit api is invoked by the command interface, and copies information
from the tree into the branch and its repository, possibly updating the
WorkingTree afterwards.</p>
<p>The command interface passes:</p>
<ul class="simple">
<li>a commit message (from an option, if any),</li>
<li>or an indication that it should be read interactively from the ui object;</li>
<li>a list of files to commit</li>
<li>an option for a dry-run commit</li>
<li>verbose option, or callback to indicate</li>
<li>timestamp, timezone, committer, chosen revision id</li>
<li>config (for what?)</li>
<li>option for local-only commit on a bound branch</li>
<li>option for strict commits (fail if there are unknown or missing files)</li>
<li>option to allow “pointless” commits (with no tree changes)</li>
</ul>
<p>(This is rather a lot of options to pass individually and just for code tidyness maybe some of them should be combine into objects.)</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="gp">&gt;&gt;&gt; </span><span class="n">Branch</span><span class="o">.</span><span class="n">commit</span><span class="p">(</span><span class="n">from_tree</span><span class="p">,</span> <span class="n">message</span><span class="p">,</span> <span class="n">files_to_commit</span><span class="p">,</span> <span class="o">...</span><span class="p">)</span>
</pre></div>
</div>
<p>There will be different implementations of this for different Branch
classes, whether for foreign branches or Bazaar repositories using
different storage methods.</p>
<p>Most of the commit should occur during a single lockstep iteration across
the workingtree and parent trees.  The WorkingTree interface needs to
provide methods that give commit all it needs.  Some of these methods
(such as answering the file’s last change revision) may be deprecated in
newer working trees and there we have a choice of either calculating the
value from the data that is present, or refusing to support commit to
newer repositories.</p>
<p>For a dirstate tree the iteration of changes from the parent can easily be
done within its own iter_changes.</p>
<p>Dirstate inventories may be most easily updated in a single operation at
the end; however it may be best to accumulate data as we proceed through
the tree rather than revisiting it at the end.</p>
<p>Showing a progress bar for commit may not be necessary if we report files
as they are committed.  Alternatively we could transiently show a progress
bar for each directory that’s scanned, even if no changes are observed.</p>
<p>This needs to collect a list of added/changed/removed files, each of which
must have its text stored (if any) and containing directory updated.  This
can be done by calling Tree._iter_changes on the source tree, asking for
changes</p>
<p>In the 0.17 model the commit operation needs to know the per-file parents
and per-file last-changed revision.</p>
<p>(In this and other operations we must avoid having multiple layers walk
over the tree separately.  For example, it is no good to have the Command
layer walk the tree to generate a list of all file ids to commit, because
the tree will also be walked later.  The layers that do need to operate
per-file should probably be bound together in a per-dirblock iterator,
rather than each iterating independently.)</p>
</div>
<div class="section" id="branch-tree-interface">
<h4><a class="toc-backref" href="#id123">1.3.4.10&nbsp;&nbsp;&nbsp;Branch-&gt;Tree interface</a><a class="headerlink" href="#branch-tree-interface" title="Permalink to this headline">¶</a></h4>
<p>The Branch commit code needs to ask the Tree what should be committed, in
terms of changes from the parent revisions.  If the Tree holds all the
necessary parent tree information itself it can do it single handed;
otherwise it may need to ask the Repository for parent information.</p>
<p>This should be a streaming interface, probably like iter_changes returning
information per directory block.</p>
<p>The interface should not return a block for directories that are
recursively unchanged.</p>
<p>The tree’s idea of what is possibly changed may be more conservative than
that of the branch.  For example the tree may report on merges of files
where the text is identical to the parents: this must be recorded for
Bazaar branches that record per-file ancestry but is not necessary for all
branches.  If the tree is responsible for determining when directories
have been recursively modified then it will report on all the parents of
such files.  There are several implementation options:</p>
<p>1. Return all files and directories the branch might want to commit, even
if the branch ends up taking no action on them.</p>
<p>2. When starting the iteration, the branch can specify what type of change
is considered interesting.</p>
<p>Since these types of changes are probably (??) rare compared to files that
are either completely unmodified or substantially modified, the first may
be the best and simplest option.</p>
<p>The branch needs to build an inventory to commit, which must include
unchanged files within changed directories.  This should be returned from
the working tree too.  Repositories that store per-directory inventories
will want to build and store these from the lowest directories up.
For 0.17 format repositories with an all-in-one inventory it may be
easiest to accumulate inventory entries in arbitrary order into an
in-memory Inventory and then serialize it.</p>
<p>It ought to be possible to commit any Tree into a Branch, without
requiring a WorkingTree; the commit code should cope if the tree is not
interested in updating hashcache information or does not have a
<code class="docutils literal notranslate"><span class="pre">last_revision</span></code>.</p>
</div>
<div class="section" id="information-from-the-tree-to-repository">
<h4><a class="toc-backref" href="#id124">1.3.4.11&nbsp;&nbsp;&nbsp;Information from the tree to repository</a><a class="headerlink" href="#information-from-the-tree-to-repository" title="Permalink to this headline">¶</a></h4>
<p>The main things the tree needs to tell the Branch about are:</p>
<ul>
<li><p class="first">A file is modified from its parent revision (in text, permissions,
other), and so its text may need to be stored.</p>
<p>Files should also be reported if they have more than one unique parent
revision, for repositories that store per-file graphs or last-change
revisions.  Perhaps this behaviour should be optional.</p>
<p><strong>XXX:</strong> are renames/deletions reported here too?</p>
</li>
<li><p class="first">The complete contents of a modified directory, so that its inventory
text may be stored.  This should be done after all the contained files
and directories have been reported.  If there are unmodified files,
or unselected files carried through from</p>
<p>XXX: Actually perhaps not grouped by directory, but rather grouped
appropriately for the shape of inventory storage in the repository.</p>
<p>In a zoomed-in checkout the workingtree may not have all the shape data
for the entire tree.</p>
</li>
<li><p class="first">A file is missing – could cause either automatic removal or an aborted
commit.</p>
</li>
<li><p class="first">Any unknown files – can cause automatic addition, abortion of a strict
commit, or just reporting.</p>
</li>
</ul>
</div>
<div class="section" id="information-from-the-repository-to-the-tree">
<h4><a class="toc-backref" href="#id125">1.3.4.12&nbsp;&nbsp;&nbsp;Information from the repository to the tree</a><a class="headerlink" href="#information-from-the-repository-to-the-tree" title="Permalink to this headline">¶</a></h4>
<p>After the commit the tree needs to be updated to the new revision.  Some
information which was accumulated during the commit must be made available
to the workingtree.  It’s probably reasonable to hold it all in memory and
allow the workingtree to get it in whatever order it wants.</p>
<ul>
<li><p class="first">A list of modified entries, and for each one:</p>
<ul class="simple">
<li>The stat values observed when the file was first read.</li>
<li>The hash of the committed file text.</li>
<li>The file’s last-change revision, if appropriate.</li>
</ul>
<p>This should include any entries automatically added or removed.</p>
</li>
</ul>
<p>This might be construed as an enhanced version of <code class="docutils literal notranslate"><span class="pre">set_parent_trees</span></code>.
We can avoid a stat on each file by using the value that was observed when
it was first read.</p>
</div>
<div class="section" id="selective-commit">
<h4><a class="toc-backref" href="#id126">1.3.4.13&nbsp;&nbsp;&nbsp;Selective commit</a><a class="headerlink" href="#selective-commit" title="Permalink to this headline">¶</a></h4>
<p>For a partial commit the directory contents may need to contain a mix of
entries from the working tree and parent trees.  This code probably
shouldn’t live in a specific tree implementation; maybe there should be a
general filter that selects paths from one tree into another?</p>
<p>However, the tree walking code does probably need to know about selected
paths to avoid examining unselected files or directories.</p>
<p>We never refuse selective file commits (except of merges).</p>
</div>
<div class="section" id="common-commit-code">
<h4><a class="toc-backref" href="#id127">1.3.4.14&nbsp;&nbsp;&nbsp;Common commit code</a><a class="headerlink" href="#common-commit-code" title="Permalink to this headline">¶</a></h4>
<p>What is common to all commit implementations, regardless of workingtree or
repository format?</p>
<ul class="simple">
<li>Prompting for a commit message?</li>
<li>Strictness/conflict checks?</li>
<li>Auto add/remove?</li>
</ul>
<p>How should this be separated?</p>
</div>
<div class="section" id="order-of-traversal">
<h4><a class="toc-backref" href="#id128">1.3.4.15&nbsp;&nbsp;&nbsp;Order of traversal</a><a class="headerlink" href="#order-of-traversal" title="Permalink to this headline">¶</a></h4>
<p>For current and contemplated Bazaar storage formats, we can only finally
commit a directory after its contained files and directories have been
committed.</p>
<p>The dirstate workingtree format naturally iterates by directory in order
by path, yielding directories before their contents.  This may also be the
most efficient order in which to stat and read the files.</p>
<p>One option would be to construe the interface as a visitor which reports
when files are detected to be changed, and also when directories are
finished.</p>
</div>
<div class="section" id="open-question-per-file-graphs">
<h4><a class="toc-backref" href="#id129">1.3.4.16&nbsp;&nbsp;&nbsp;Open question: per-file graphs</a><a class="headerlink" href="#open-question-per-file-graphs" title="Permalink to this headline">¶</a></h4>
<p><strong>XXX:</strong> If we want to retain explicitly stored per-file graphs, it would
seem that we do need to record per-file parents.  We have not yet finally
settled that we do want to remove them or treat them as a cache.  This api
stack is still ok whether we do or not, but the internals of it may
change.</p>
</div>
</div>
<div class="section" id="diff-performance-analysis">
<h3><a class="toc-backref" href="#id75">1.3.5&nbsp;&nbsp;&nbsp;diff Performance Analysis</a><a class="headerlink" href="#diff-performance-analysis" title="Permalink to this headline">¶</a></h3>
<div class="contents local topic" id="id6">
<ul class="auto-toc simple">
<li><a class="reference internal" href="#minimal-work" id="id130">1.3.5.1&nbsp;&nbsp;&nbsp;Minimal Work</a><ul class="auto-toc">
<li><a class="reference internal" href="#reuse-of-historical-comparisons" id="id131">1.3.5.1.1&nbsp;&nbsp;&nbsp;Reuse of historical comparisons</a></li>
<li><a class="reference internal" href="#historical-tree-against-historical-tree" id="id132">1.3.5.1.2&nbsp;&nbsp;&nbsp;Historical Tree Against Historical Tree</a></li>
<li><a class="reference internal" href="#basis-against-historical-tree" id="id133">1.3.5.1.3&nbsp;&nbsp;&nbsp;Basis Against Historical Tree</a></li>
<li><a class="reference internal" href="#basis-against-basis" id="id134">1.3.5.1.4&nbsp;&nbsp;&nbsp;Basis Against Basis</a></li>
<li><a class="reference internal" href="#working-tree-against-basis" id="id135">1.3.5.1.5&nbsp;&nbsp;&nbsp;Working Tree Against Basis</a></li>
<li><a class="reference internal" href="#working-tree-against-historical-tree" id="id136">1.3.5.1.6&nbsp;&nbsp;&nbsp;Working Tree Against Historical Tree</a></li>
<li><a class="reference internal" href="#working-tree-against-working-tree" id="id137">1.3.5.1.7&nbsp;&nbsp;&nbsp;Working Tree Against Working Tree</a></li>
</ul>
</li>
<li><a class="reference internal" href="#api-changes" id="id138">1.3.5.2&nbsp;&nbsp;&nbsp;API Changes</a></li>
<li><a class="reference internal" href="#storage-considerations" id="id139">1.3.5.3&nbsp;&nbsp;&nbsp;Storage considerations</a></li>
</ul>
</div>
<div class="section" id="minimal-work">
<h4><a class="toc-backref" href="#id130">1.3.5.1&nbsp;&nbsp;&nbsp;Minimal Work</a><a class="headerlink" href="#minimal-work" title="Permalink to this headline">¶</a></h4>
<div class="section" id="reuse-of-historical-comparisons">
<h5><a class="toc-backref" href="#id131">1.3.5.1.1&nbsp;&nbsp;&nbsp;Reuse of historical comparisons</a><a class="headerlink" href="#reuse-of-historical-comparisons" title="Permalink to this headline">¶</a></h5>
<p>A significant part of the work done by diff is sequence matching.  This
scales O(n^2) with the number of lines in the file.  Therefore, it
is worthwile to avoid content comparisons as much as possible.</p>
<p>Our current knit format contains content comparisons, and this data can
be converted into lists of matching blocks.  Other future formats such as
mpdiff may also support such conversion.  So it is possible to reuse past
comparisons.</p>
<p>It is also possible to combine sequential comparisons.  So given a comparison
of “foo” to “bar”, and “bar” to “baz”, it is possible to derive a comparison of
“foo” to “baz”.</p>
<p>Reuse of historical comparisons will scale with the number of uncommon
build-parents between the two historical revisions.  This will typically be
proportional to the amount of change that the file has undergone.  Therefore,
in the common case, reuse of historical comparisons will scale with the
amount of change.</p>
<p>The downside of such reuse is that it ties the comparison to the historical
data.  But given the performance improvement, it seems to be worth
consideration.  Fresh comparisons can be performed if the user requests them.</p>
<p>It may also be possible to accelerate comparisons by including annotation data,
thus increasing the number of unique lines.</p>
</div>
<div class="section" id="historical-tree-against-historical-tree">
<h5><a class="toc-backref" href="#id132">1.3.5.1.2&nbsp;&nbsp;&nbsp;Historical Tree Against Historical Tree</a><a class="headerlink" href="#historical-tree-against-historical-tree" title="Permalink to this headline">¶</a></h5>
<p>This operation should be strictly proportional to the amount of change, because
a comparison has already been done at commit time.  Achieving that performance
requires the committed data to be properly structured, so that the comparison
can be extracted and combined with other comparisons.  This comparision
extraction should be possible at the inventory and file-content levels.</p>
<p>Minimum work:</p>
<ol class="arabic simple">
<li>Extract and combine inventory comparisons</li>
<li>Extract and combine text comparisions for modified texts</li>
</ol>
</div>
<div class="section" id="basis-against-historical-tree">
<h5><a class="toc-backref" href="#id133">1.3.5.1.3&nbsp;&nbsp;&nbsp;Basis Against Historical Tree</a><a class="headerlink" href="#basis-against-historical-tree" title="Permalink to this headline">¶</a></h5>
<p>This is another case of Historical Tree Against Historical Tree.</p>
</div>
<div class="section" id="basis-against-basis">
<h5><a class="toc-backref" href="#id134">1.3.5.1.4&nbsp;&nbsp;&nbsp;Basis Against Basis</a><a class="headerlink" href="#basis-against-basis" title="Permalink to this headline">¶</a></h5>
<p>This is another case of Historical Tree Against Historical Tree.</p>
</div>
<div class="section" id="working-tree-against-basis">
<h5><a class="toc-backref" href="#id135">1.3.5.1.5&nbsp;&nbsp;&nbsp;Working Tree Against Basis</a><a class="headerlink" href="#working-tree-against-basis" title="Permalink to this headline">¶</a></h5>
<p>This must scale with the number of versioned files, unless the user indicates
that only certain files should be compared.</p>
<p>Performance can be further improved by caching comparisons to avoid repeating
them.  Caching could potentially be performed by <code class="docutils literal notranslate"><span class="pre">diff</span></code> and perhaps by
<code class="docutils literal notranslate"><span class="pre">merge</span></code>.  Merge is aware of the relationship of a text merge’s result to
the THIS value, and the THIS value is generally the basis value.  So the
comparison is latent, but present.  The only issue is extracting it.</p>
<p>The cache could be indexed by sha1sum pairs.  It could also be indexed by
file-id, to facilitate removal of stale data.</p>
<p>Minimum work:</p>
<ol class="arabic simple">
<li>Scan working tree for modified files</li>
<li>Retrieve cached comparisons</li>
<li>Perform comparisons on files with no cached comparisons</li>
<li>Cache comparisons for files with no cached comparisons</li>
</ol>
</div>
<div class="section" id="working-tree-against-historical-tree">
<h5><a class="toc-backref" href="#id136">1.3.5.1.6&nbsp;&nbsp;&nbsp;Working Tree Against Historical Tree</a><a class="headerlink" href="#working-tree-against-historical-tree" title="Permalink to this headline">¶</a></h5>
<p>This can be structured as a comparison of working tree against basis tree,
followed by basis tree against historical tree.  Therefore, it combines the
performance characteristics of “Working Tree Against Basis” with “Basis Against
Historical Tree”.</p>
</div>
<div class="section" id="working-tree-against-working-tree">
<h5><a class="toc-backref" href="#id137">1.3.5.1.7&nbsp;&nbsp;&nbsp;Working Tree Against Working Tree</a><a class="headerlink" href="#working-tree-against-working-tree" title="Permalink to this headline">¶</a></h5>
<p>This can be structured as two comparisons against basis, and one comparison
of basis against basis.  Its performance is therefore similar to Working Tree
Against Historical Tree.</p>
</div>
</div>
<div class="section" id="api-changes">
<h4><a class="toc-backref" href="#id138">1.3.5.2&nbsp;&nbsp;&nbsp;API Changes</a><a class="headerlink" href="#api-changes" title="Permalink to this headline">¶</a></h4>
<p>Desired API:</p>
<blockquote>
<div><ul class="simple">
<li>Tree.get_comparision(file_id, tree)</li>
</ul>
</div></blockquote>
<p>This probably entails:</p>
<blockquote>
<div><ul class="simple">
<li>WorkingTree.store_comparison(file_id, revision_id, sha1, comparison)</li>
<li>WorkingTree.get_comparison(file_id, revision_id, sha1)</li>
<li>Repository.get_comparision(file_id, revision_id, revision_id)</li>
<li>merge_comparisions(comparison, comparision)</li>
</ul>
</div></blockquote>
</div>
<div class="section" id="storage-considerations">
<h4><a class="toc-backref" href="#id139">1.3.5.3&nbsp;&nbsp;&nbsp;Storage considerations</a><a class="headerlink" href="#storage-considerations" title="Permalink to this headline">¶</a></h4>
<p>It must be cheap (e.g. scale with number of intermediate revisions) to perform
comparison of two historical texts.  It must be cheap to perform comparison of
the inventories of two historical trees.</p>
</div>
</div>
<div class="section" id="garbage-collection">
<h3><a class="toc-backref" href="#id86">1.3.6&nbsp;&nbsp;&nbsp;Garbage Collection</a><a class="headerlink" href="#garbage-collection" title="Permalink to this headline">¶</a></h3>
<p>Garbage collection is used to remove data from a repository that is no longer referenced.</p>
<p>Generally this involves locking the repository and scanning all its branches
then generating a new repository with less data.</p>
<div class="section" id="id7">
<h4><a class="toc-backref" href="#id87">1.3.6.1&nbsp;&nbsp;&nbsp;Least work we can hope to perform</a><a class="headerlink" href="#id7" title="Permalink to this headline">¶</a></h4>
<ul class="simple">
<li>Read all branches to get initial references - tips + tags.</li>
<li>Read through the revision graph to find unreferenced revisions. A cheap HEADS
list might help here by allowing comparison of the initial references to the
HEADS - any unreferenced head is garbage.</li>
<li>Walk out via inventory deltas to get the full set of texts and signatures to preserve.</li>
<li>Copy to a new repository</li>
<li>Bait and switch back to the original</li>
<li>Remove the old repository.</li>
</ul>
<p>A possibility to reduce this would be to have a set of grouped ‘known garbage
free’ data - ‘ancient history’ which can be preserved in total should its HEADS
be fully referenced - and where the HEADS list is deliberate cheap (e.g. at the
top of some index).</p>
<p>possibly - null data in place without saving size.</p>
</div>
</div>
<div class="section" id="revert">
<h3><a class="toc-backref" href="#id88">1.3.7&nbsp;&nbsp;&nbsp;Revert</a><a class="headerlink" href="#revert" title="Permalink to this headline">¶</a></h3>
<p>Change users selected paths to be the same as those in a given revision making
backups of any paths that bzr did not set the last contents itself.</p>
<div class="section" id="id8">
<h4><a class="toc-backref" href="#id89">1.3.7.1&nbsp;&nbsp;&nbsp;Least work we can hope to perform</a><a class="headerlink" href="#id8" title="Permalink to this headline">¶</a></h4>
<p>We should be able to do work proportional to the scope the user is reverting
and the amount of changes between the working tree and the revision being
reverted to.</p>
<p>This depends on being able to compare unchanged subtrees without recursing so that the mapping of paths to revert to ids to revert can be done efficiently. Specifically we should be able to avoid getting the transitive closure of directory contents when mapping back to paths from ids at the start of revert.</p>
<p>One way this might work is to:
for the selected scopes, for each element in the wt:</p>
<blockquote>
<div>1. get hash tree data for that scope.
1. get ‘new enough’ hash data for the siblings of the scope: it can be out of date as long as it’s not older than the last move or rename out of that siblings scope.
1. Use the hash tree data to tune the work done in finding matching paths/ids which are different in the two trees.</div></blockquote>
<p>For each thing that needs to change - group by target directory?</p>
<blockquote>
<div>1. Extract new content.
1. Backup old content or replace-in-place (except windows where we move and replace).</div></blockquote>
</div>
</div>
<div class="section" id="the-status-command">
<h3><a class="toc-backref" href="#id90">1.3.8&nbsp;&nbsp;&nbsp;The status command</a><a class="headerlink" href="#the-status-command" title="Permalink to this headline">¶</a></h3>
<p>The status command is used to provide a pithy listing of the changes between
two trees. Its common case is between the working tree and the basis tree, but
it can be used between any two arbitrary trees.</p>
<div class="contents local topic" id="id9">
<ul class="auto-toc simple">
<li><a class="reference internal" href="#ui-overview" id="id140">1.3.8.1&nbsp;&nbsp;&nbsp;UI Overview</a></li>
<li><a class="reference internal" href="#ideal-work-for-working-tree-to-historical-status" id="id141">1.3.8.2&nbsp;&nbsp;&nbsp;Ideal work for working tree to historical status</a></li>
<li><a class="reference internal" href="#locality-of-reference" id="id142">1.3.8.3&nbsp;&nbsp;&nbsp;Locality of reference</a></li>
<li><a class="reference internal" href="#scaling-observations" id="id143">1.3.8.4&nbsp;&nbsp;&nbsp;Scaling observations</a></li>
</ul>
</div>
<div class="section" id="ui-overview">
<h4><a class="toc-backref" href="#id140">1.3.8.1&nbsp;&nbsp;&nbsp;UI Overview</a><a class="headerlink" href="#ui-overview" title="Permalink to this headline">¶</a></h4>
<p>Status shows several things in parallel (for the paths the user supplied mapped
across the from and to tree, and any pending merges in the to tree).</p>
<ol class="arabic simple">
<li>Single line summary of all new revisions - the pending merges and their
parents recursively.</li>
<li>Changes to the tree shape - adds/deletes/renames.</li>
<li>Changes to versioned content - kind changes and content changes.</li>
<li>Unknown files in the to tree.</li>
<li>Files with conflicts in the to tree.</li>
</ol>
</div>
<div class="section" id="ideal-work-for-working-tree-to-historical-status">
<h4><a class="toc-backref" href="#id141">1.3.8.2&nbsp;&nbsp;&nbsp;Ideal work for working tree to historical status</a><a class="headerlink" href="#ideal-work-for-working-tree-to-historical-status" title="Permalink to this headline">¶</a></h4>
<p>We need to do the following things at a minimum:</p>
<ol class="arabic simple">
<li>Determine new revisions - the pending merges and history.</li>
</ol>
<ol class="arabic simple">
<li>Retrieve the first line of the commit message for the new revisions.</li>
</ol>
<ol class="arabic simple">
<li>Determine the tree differences between the two trees using the users paths
to limit the scope, and resolving paths in the trees for any pending merges.
We arguably don’t care about tracking metadata for this - only the value of
the tree the user commited.</li>
</ol>
<ol class="arabic simple">
<li>The entire contents of directories which are versioned when showing
unknowns.</li>
</ol>
<ol class="arabic simple">
<li>Whether a given unversioned path is unknown or ignored.</li>
</ol>
<ol class="arabic simple">
<li>The list conflicted paths in the tree (which match the users path
selection?)</li>
</ol>
<p>Expanding on the tree difference case we will need to:</p>
<ol class="arabic simple">
<li>Stat every path in working trees which is included by the users path
selection to ascertain kind and execute bit.</li>
</ol>
<ol class="arabic simple">
<li>For paths which have the same kind in both trees and have content, read
that content or otherwise determine whether the content has changed. Using
our hash cache from the dirstate allows us to avoid reading the file in the
common case. There are alternative ways to achieve this - we could record
a pointer to a revision which contained this fileid with the current content
rather than storing the content’s hash; but this seems to be a pointless
double-indirection unless we save enough storage in the working tree. A
variation of this is to not record an explicit pointer but instead
define an implicit pointer as being to the left-hand-parent tree.</li>
</ol>
</div>
<div class="section" id="locality-of-reference">
<h4><a class="toc-backref" href="#id142">1.3.8.3&nbsp;&nbsp;&nbsp;Locality of reference</a><a class="headerlink" href="#locality-of-reference" title="Permalink to this headline">¶</a></h4>
<ul class="simple">
<li>We should stat files in the same directory without reading or statting
files in other directories. That is we should do all the statting we
intend to do within a given directory without doing any other IO, to
minimise pressure on the drive heads to seek.</li>
<li>We should read files in the same directory without reading or writing
files in other directories - and note this is separate to statting (file
data is usually physically disjoint to metadata).</li>
</ul>
</div>
<div class="section" id="scaling-observations">
<h4><a class="toc-backref" href="#id143">1.3.8.4&nbsp;&nbsp;&nbsp;Scaling observations</a><a class="headerlink" href="#scaling-observations" title="Permalink to this headline">¶</a></h4>
<ul class="simple">
<li>The stat operation clearly involves every versioned path in the common case.</li>
<li>Expanding out the users path selection in a naive manner involves reading the
entire tree shape information for both trees and for all pending-merge trees.
(Dirstate makes this tolerably cheap for now, but we’re still scaling
extra-linearly.)</li>
<li>The amount of effort required to generate tree differences between the
working tree and the basis tree is interesting: with a tree-like structure
and some generatable name for child nodes we use the working tree data to
eliminate accessing or considering subtrees regardless of historival
age. However, if we have had to access the historical tree shape to
perform path selection this rather reduces the win we can obtain here.
If we can cause path expansion to not require historical shape access
(perhaps by performing the expansion after calculating the tree
difference for the top level of the selected path) then we can gain a
larger win. This strongly suggests that path expansion and tree
difference generation should be linked in terms of API.</li>
</ul>
</div>
</div>
<div class="section" id="annotate">
<h3><a class="toc-backref" href="#id95">1.3.9&nbsp;&nbsp;&nbsp;Annotate</a><a class="headerlink" href="#annotate" title="Permalink to this headline">¶</a></h3>
<p>Broadly tries to ascribe parts of the tree state to individual commits.</p>
<p>There appear to be three basic ways of generating annotations:</p>
<p>If the annotation works by asking the storage layer for successive full texts
then the scaling of this will be proportional to the time to diff throughout
the history of thing being annotated.</p>
<p>If the annotation works by asking the storage layer for successive deltas
within the history of the thing being annotated we believe we can make it scale
broadly proportional to the depth of the tree of revisions of the annotated
object.</p>
<p>If the annotation works by combining cached annotations such that creating a
full text recreates annotations for it then it will scale with the cost of
obtaining that text.</p>
<p>Generally we want our current annotations but it would be nice to be able to do
whitespace annotations and potentially other diff based annotations.</p>
<p>Some things to think about:</p>
<blockquote>
<div><ul class="simple">
<li>Perhaps multiparent deltas would allow us to not store the cached
annotations in each delta without losing performance or accuracy.</li>
</ul>
</div></blockquote>
</div>
<div class="section" id="scaling-analysys-of-merge">
<h3><a class="toc-backref" href="#id96">1.3.10&nbsp;&nbsp;&nbsp;Scaling analysys of Merge</a><a class="headerlink" href="#scaling-analysys-of-merge" title="Permalink to this headline">¶</a></h3>
<ol class="arabic simple">
<li>Fetch revisions O(a)</li>
<li>Common Ancestor [O(b)] <strong>O(h)</strong></li>
<li>Calculate tree merge O(c) [+ O(b) + O(d)] <strong>+ O(i)</strong></li>
</ol>
<blockquote>
<div><ul class="simple">
<li>text merge O(e * e * f) + O(b)</li>
</ul>
</div></blockquote>
<ol class="arabic simple" start="4">
<li>Find filesystem conflicts O(c)</li>
<li>Resolve filesystem conflicts O(g)</li>
<li>Apply changes O(c) + O(log(d))</li>
<li>Set pending merges O(1)</li>
<li>Print conflicts O(g)</li>
<li>Print changes O(c)</li>
</ol>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">a:</th><td class="field-body">revisions missing from repo:</td>
</tr>
<tr class="field-even field"><th class="field-name">b:</th><td class="field-body">nodes in the revision graph:</td>
</tr>
<tr class="field-odd field"><th class="field-name">c:</th><td class="field-body">files that differ between base and other:</td>
</tr>
<tr class="field-even field"><th class="field-name">d:</th><td class="field-body">number of files in the tree</td>
</tr>
<tr class="field-odd field"><th class="field-name">e:</th><td class="field-body">number of lines in the text</td>
</tr>
<tr class="field-even field"><th class="field-name">f:</th><td class="field-body">number of files requiring text merge</td>
</tr>
<tr class="field-odd field"><th class="field-name">g:</th><td class="field-body">number of conflicts (g &lt;= c)</td>
</tr>
<tr class="field-even field"><th class="field-name">h:</th><td class="field-body">humber of uncommon ancestors</td>
</tr>
<tr class="field-odd field"><th class="field-name">i:</th><td class="field-body">number of revisions between base and other</td>
</tr>
</tbody>
</table>
<div class="section" id="needs">
<h4><a class="toc-backref" href="#id97">1.3.10.1&nbsp;&nbsp;&nbsp;Needs</a><a class="headerlink" href="#needs" title="Permalink to this headline">¶</a></h4>
<ul class="simple">
<li>Access to revision graph proportional to number of revisions read</li>
<li>Access to changed file metadata proportional to number of changes and number of intervening revisions.</li>
<li>O(1) access to fulltexts</li>
</ul>
</div>
<div class="section" id="notes">
<h4><a class="toc-backref" href="#id98">1.3.10.2&nbsp;&nbsp;&nbsp;Notes</a><a class="headerlink" href="#notes" title="Permalink to this headline">¶</a></h4>
<p>Multiparent deltas may offer some nice properties for performance of annotation based merging.</p>
</div>
</div>
<div class="section" id="bundle-creation">
<h3><a class="toc-backref" href="#id99">1.3.11&nbsp;&nbsp;&nbsp;Bundle Creation</a><a class="headerlink" href="#bundle-creation" title="Permalink to this headline">¶</a></h3>
<ol class="arabic simple">
<li>Find common ancestor [O(a)] <strong>O(b)</strong></li>
<li>Emit bundle [O(a)] <strong>O(b) O(h)</strong></li>
</ol>
<blockquote>
<div><p>Per revision</p>
<ol class="arabic simple">
<li>emit metadata O(1)</li>
<li>emit changes for files</li>
</ol>
<blockquote>
<div><ol class="arabic simple">
<li>find changed files [O(c)] <strong>O(f)</strong></li>
<li>emit file metadata O(d)</li>
<li>emit diff [O(e * e) * O(f) + O(h)] <strong>O(i)</strong></li>
<li>base64 encode O(g)</li>
</ol>
</div></blockquote>
</div></blockquote>
<ol class="arabic simple" start="3">
<li><strong>emit overal diff (or maybe do interdiff) O(e * e) * O(f)</strong></li>
</ol>
<table class="docutils field-list" frame="void" rules="none">
<col class="field-name" />
<col class="field-body" />
<tbody valign="top">
<tr class="field-odd field"><th class="field-name">a:</th><td class="field-body">nodes in revision graph</td>
</tr>
<tr class="field-even field"><th class="field-name">b:</th><td class="field-body">number of descendants of common ancestor</td>
</tr>
<tr class="field-odd field"><th class="field-name">c:</th><td class="field-body">number of files in the tree</td>
</tr>
<tr class="field-even field"><th class="field-name">d:</th><td class="field-body">length of metadata</td>
</tr>
<tr class="field-odd field"><th class="field-name">e:</th><td class="field-body">number of lines</td>
</tr>
<tr class="field-even field"><th class="field-name">f:</th><td class="field-body">number of modified files</td>
</tr>
<tr class="field-odd field"><th class="field-name">g:</th><td class="field-body">length of diff</td>
</tr>
<tr class="field-even field"><th class="field-name">h:</th><td class="field-body">nodes in knit graph of modified files</td>
</tr>
<tr class="field-odd field"><th class="field-name">i:</th><td class="field-body">length of stored diff</td>
</tr>
</tbody>
</table>
<div class="section" id="id10">
<h4><a class="toc-backref" href="#id100">1.3.11.1&nbsp;&nbsp;&nbsp;Needs</a><a class="headerlink" href="#id10" title="Permalink to this headline">¶</a></h4>
<ul class="simple">
<li>Improved common ancestor algorithm</li>
<li>Access to partial revision graph proportional to relevant revisions</li>
<li>Access to changed files proportional to number of change files and
intervening revisions</li>
<li>Use knit deltas without recomputing</li>
<li>Access to knit deltas in O(1) time</li>
<li>Access to snapshots in O(1) amortized time</li>
<li>All snapshots must have knit deltas</li>
</ul>
</div>
</div>
<div class="section" id="uncommit-performance-notes">
<h3><a class="toc-backref" href="#id101">1.3.12&nbsp;&nbsp;&nbsp;Uncommit Performance Notes</a><a class="headerlink" href="#uncommit-performance-notes" title="Permalink to this headline">¶</a></h3>
<div class="section" id="specification-of-uncommit">
<h4><a class="toc-backref" href="#id102">1.3.12.1&nbsp;&nbsp;&nbsp;Specification of uncommit</a><a class="headerlink" href="#specification-of-uncommit" title="Permalink to this headline">¶</a></h4>
<p><code class="docutils literal notranslate"><span class="pre">uncommit</span></code> removes revisions from the head of a branch.  (By default, only
the very latest revision is removed, but optionally more can be taken.)
Uncommit does not affect the repository (garbage collection is a separate
step and not done by default).  The working tree is not logically
modified (revert is a different operation), except as described below
about merges.</p>
<p>Uncommit can be performed on either a branch or a working tree (and
implicitly its branch.)</p>
<p>If the uncommitted revisions includes one or more merges, after the
uncommit those revisions are in the working tree’s list of pending merges,
because their tree changes are still present in the tree.</p>
<p>For a bound branch, uncommit fails unless the local branch is up to date.</p>
</div>
</div>
<div class="section" id="missing">
<h3><a class="toc-backref" href="#id103">1.3.13&nbsp;&nbsp;&nbsp;Missing</a><a class="headerlink" href="#missing" title="Permalink to this headline">¶</a></h3>
<p>Missing is used to find out the differences between the current branch and
another branch.</p>
<p>The performance analysis itself brings no further points than the
incremental-push-pull one.</p>
<p>More importantly, the UI have been considered not optimal: missing finds and
displays the differences between two branches, presenting the revisions that
are not common to both branches as two sets:</p>
<ul class="simple">
<li>the revisions that are present only in the current branch,</li>
<li>the revisions that are present only in the other branch.</li>
</ul>
<p>A quick and dirty survey indicates that most of the users are interested in
only one set of revisions at a time.</p>
<p>From a performance point of view, it may be more appropriate to calculate only
the set the user is asking for.</p>
<p>It has been proposed that the missing command be deprecated in favor of a
–dry-run option for the push, pull, merge commands.</p>
<p>In the mean time, the missing command stays interesting as it provides an easy
way to test, measure and optimize graph differences processing.</p>
</div>
</div>
<div class="section" id="subsystem-designs">
<h2><a class="toc-backref" href="#id104">1.4&nbsp;&nbsp;&nbsp;Subsystem designs</a><a class="headerlink" href="#subsystem-designs" title="Permalink to this headline">¶</a></h2>
<div class="section" id="directory-fingerprints">
<h3><a class="toc-backref" href="#id105">1.4.1&nbsp;&nbsp;&nbsp;Directory fingerprints</a><a class="headerlink" href="#directory-fingerprints" title="Permalink to this headline">¶</a></h3>
<div class="contents local topic" id="id11">
<ul class="auto-toc simple">
<li><a class="reference internal" href="#introduction" id="id144">1.4.1.1&nbsp;&nbsp;&nbsp;Introduction</a></li>
<li><a class="reference internal" href="#use-case-oriented-apis" id="id145">1.4.1.2&nbsp;&nbsp;&nbsp;Use-case oriented APIs</a><ul class="auto-toc">
<li><a class="reference internal" href="#commit" id="id146">1.4.1.2.1&nbsp;&nbsp;&nbsp;<code class="docutils literal notranslate"><span class="pre">commit</span></code></a></li>
<li><a class="reference internal" href="#log" id="id147">1.4.1.2.2&nbsp;&nbsp;&nbsp;<code class="docutils literal notranslate"><span class="pre">log</span></code></a></li>
</ul>
</li>
<li><a class="reference internal" href="#open-questions" id="id148">1.4.1.3&nbsp;&nbsp;&nbsp;Open questions</a></li>
<li><a class="reference internal" href="#conclusions" id="id149">1.4.1.4&nbsp;&nbsp;&nbsp;Conclusions</a></li>
<li><a class="reference internal" href="#design-changes" id="id150">1.4.1.5&nbsp;&nbsp;&nbsp;Design changes</a></li>
<li><a class="reference internal" href="#id12" id="id151">1.4.1.6&nbsp;&nbsp;&nbsp;API changes</a></li>
</ul>
</div>
<div class="section" id="introduction">
<h4><a class="toc-backref" href="#id144">1.4.1.1&nbsp;&nbsp;&nbsp;Introduction</a><a class="headerlink" href="#introduction" title="Permalink to this headline">¶</a></h4>
<p>The basic idea is that for a directory in a tree (committed or otherwise), we
will have a single scalar value.  If these values are the same, the contents of
the subtree under that directory are necessarily the same.</p>
<p>This is intended to help with these use cases, by allowing them to quickly skip
over directories with no relevant changes, and to detect when a directory has
changed:</p>
<ul class="simple">
<li>diff/status (both local trees and historical trees)</li>
<li>merge</li>
<li>log -v</li>
<li>log on a directory</li>
<li>commit</li>
</ul>
</div>
<div class="section" id="use-case-oriented-apis">
<h4><a class="toc-backref" href="#id145">1.4.1.2&nbsp;&nbsp;&nbsp;Use-case oriented APIs</a><a class="headerlink" href="#use-case-oriented-apis" title="Permalink to this headline">¶</a></h4>
<p>Most of this will be hidden behind the Tree interface.  This should cover
<code class="docutils literal notranslate"><span class="pre">log</span> <span class="pre">-v</span></code>, <code class="docutils literal notranslate"><span class="pre">diff</span></code>, <code class="docutils literal notranslate"><span class="pre">status</span></code>, <code class="docutils literal notranslate"><span class="pre">merge</span></code> (and implicit merge during
push, pull, update):</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">tree</span><span class="o">.</span><span class="n">iter_changes</span><span class="p">(</span><span class="n">other_tree</span><span class="p">)</span>
<span class="n">tree</span><span class="o">.</span><span class="n">get_file_lines</span><span class="p">(</span><span class="n">file_id</span><span class="p">)</span>   <span class="c1"># and get_file, get_file_text</span>
</pre></div>
</div>
<div class="section" id="commit">
<h5><a class="toc-backref" href="#id146">1.4.1.2.1&nbsp;&nbsp;&nbsp;<code class="docutils literal notranslate"><span class="pre">commit</span></code></a><a class="headerlink" href="#commit" title="Permalink to this headline">¶</a></h5>
<p>Commit is similar to <code class="docutils literal notranslate"><span class="pre">iter_changes</span></code>, but different because it needs to
compare to all the trees.  Commit currently needs to compare the working
tree to all the parent trees, which is needed to update the last_modified
field and would be unnecessary if we removed that field (for both files
and directories) and did not store per-file graphs.
This would potentially speed up commit after merge.</p>
<p>Verbose commit also displays the merged files, which does
require looking at all parents of files that aren’t identical
to the left-hand parent.</p>
</div>
<div class="section" id="log">
<h5><a class="toc-backref" href="#id147">1.4.1.2.2&nbsp;&nbsp;&nbsp;<code class="docutils literal notranslate"><span class="pre">log</span></code></a><a class="headerlink" href="#log" title="Permalink to this headline">¶</a></h5>
<p>Log is interested in two operations: finding the revisions that touched
anything inside a directory, and getting the differences between
consecutive revisions (possibly filtered to a directory):</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">find_touching_revisions</span><span class="p">(</span><span class="n">branch</span><span class="p">,</span> <span class="n">file_id</span><span class="p">)</span> <span class="c1"># should be on Branch?</span>
</pre></div>
</div>
<p>Log shows the revisions that merged a change.  At the moment that is not
included in the per-file graph, and it would also not be visible if the
directories were hashed.</p>
</div>
</div>
<div class="section" id="open-questions">
<h4><a class="toc-backref" href="#id148">1.4.1.3&nbsp;&nbsp;&nbsp;Open questions</a><a class="headerlink" href="#open-questions" title="Permalink to this headline">¶</a></h4>
<ul>
<li><p class="first">Is this a good idea at all?</p>
<p>If changing a file changes all its parent directories up to the root it
will cause more churn on commit.  (We currently update the all-in-one
inventory, but only have to update one line of it.)</p>
<p>Every time a child changes, we’ll get a new node in the per-directory
graph.  This is generally useful: it allows bzr log to do the default
mode easily, which is to show all changes under that directory.  The
less common operation, <code class="docutils literal notranslate"><span class="pre">log</span> <span class="pre">--no-recursive</span></code> is still possible by
looking only at when the directory itself was renamed, added or removed.
(That is what the directory graph describes in bzr 0.18 and it is rarely
useful.)</p>
</li>
<li><p class="first">Should these be hashes or revision ids or something else?</p>
<p>Pros of using hashes: hashes are easy to generate by a foreign branch
plugin (e.g. bzr-svn).  They don’t need to get recursive last-changed
from the foreign branch, or to walk back through history.  They just
need the relevant directory state, which any system we support can
answer.</p>
<p>Hashes converge: if you modify and then modify back, you get the same
hash.  This is a pro because you can detect that there were ultimately
no significant changes.  And also a con: you cannot use these hashes to form a graph
because they get cycles.</p>
</li>
<li><p class="first">Are the values unique across the whole tree, or only when comparing
different versions of the same object?</p>
<p>If we use last-changed revisions, then they will be very not unique
across the whole tree.  To look up the contents, you must pass a
composite key like <code class="docutils literal notranslate"><span class="pre">(file_id,</span> <span class="pre">last_changed)</span></code>.</p>
<p>If we use hashes they will be same only when the two contain the same
contents.  Since we say that file ids must be unique, this
means they will match if and only if they are empty.  We might relax
that in future when we introduce path tokens.</p>
</li>
<li><p class="first">Is it reasonable to assume hashes won’t collide?</p>
<p>The odds of SHA-1 hashes colliding “accidentally” are vanishingly small.</p>
<p>It is possible that a <a class="reference external" href="http://tools.ietf.org/html/rfc4270">preimage attack</a> against SHA-1 may be discovered
in the future.  Since we’re not proposing in this document to make
revision-ids be SHA-1, if SHA-1 was obsoleted then we could rewrite the
contents of revisions but would not need to rename revisions.  So the
impact of such a migration should just be a format upgrade, and a
recommendation (but not requirement) to re-sign revisions.</p>
</li>
</ul>
<ul>
<li><p class="first">If we use hashes, should it be the hash of the
representation stored for a directory?</p>
<p>In other words, should we pun the representation of the directory with
the form used for validation.</p>
<p>If there’s some data stored that’s not in the hash it’s problematic.
The hash in no longer (effectively) uniquely identifies the
representation.</p>
<p>It is desirable that we have a hash that covers all data, to guard
against bugs, transmission errors, or users trying to hand-hack files.
Since we need one hash of everything in the tree, perhaps we should also
use it for the fingerprint.</p>
<p>Testaments explicitly separate the form used for hashing/signing from
the form used for storage.  This allows us to change the storage form
without breaking existing GPG signatures.  The downside is that we need
to do work O(tree) to make a testament, and this slows down signing,
verifying and generating bundles.  It also means that there is some
stored data which is not protected by the signature: this data is less
important, but corruption of it would still cause problems.
We have encountered some specific problems with disagreement between
inventories as to the last-change of files, which is currently unsigned.
These problems can be introduced by ghosts.</p>
<p>If we hash the representation, there is still a way to support old
signatures, assuming that we never discard irreplaceable information.
The signature should say what format it applies to (similar to
testaments), and we could transform in memory the tree back to that
format.</p>
</li>
<li><p class="first">Is hashing substantially slower than other possible approaches?</p>
<p>We already hash all the plain files.  Except in unusual cases, the
directory metadata will be substantially smaller: perhaps 200:1 as a
rule of thumb.</p>
<p>When building a bzr tree, we spend on the order of 100ms hashing all the
source lines to validate them (about 13MB of source).</p>
</li>
<li><p class="first">Can you calculate one from a directory in the working tree?  Without a basis?</p>
<p>This seems possible with either hashes or revision ids.</p>
<p>Using last_changed means that calculating the fingerprint from a working
tree necessarily requires reading the inventory for the basis
revision, so that we know when unchanged files were last changed.  With
hashes we could calculate them using the working tree information alone.
It’s true that we will often then compare that information to the basis
tree (e.g. for simple <code class="docutils literal notranslate"><span class="pre">bzr</span> <span class="pre">diff</span></code>), but we may only have to compare at
the top level, and sometimes we’re comparing to a
different tree.  This also touches on whether we should store
<code class="docutils literal notranslate"><span class="pre">last_modified</span></code> for files, rather than directories.</p>
<p>For revision ids we need to assign a value to use for uncommitted
changes, but see below about the problems of this.</p>
<p>In some ways it would be elegant to say (hypothetical):</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">wt</span><span class="o">.</span><span class="n">get_root</span><span class="p">()</span><span class="o">.</span><span class="n">get_last_modified</span><span class="p">()</span> <span class="o">==</span> <span class="n">branch</span><span class="o">.</span><span class="n">get_last_revision</span><span class="p">()</span>
</pre></div>
</div>
<p>to know that nothing was changed; but this may not be much better than</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">wt</span><span class="o">.</span><span class="n">get_root</span><span class="p">()</span><span class="o">.</span><span class="n">get_hash</span><span class="p">()</span> <span class="o">==</span>
  <span class="n">branch</span><span class="o">.</span><span class="n">get_basis</span><span class="p">()</span><span class="o">.</span><span class="n">get_root</span><span class="p">()</span><span class="o">.</span><span class="n">get_hash</span><span class="p">()</span>
</pre></div>
</div>
</li>
<li><p class="first">Can you use this to compare (directories from) two working trees?</p>
<p>If you can generate it from a working tree, you should be able to use it
to compare them.</p>
<p>This does rule out for example using <code class="docutils literal notranslate"><span class="pre">last_modified=None</span></code> or
<code class="docutils literal notranslate"><span class="pre">='current:'</span></code> to mean “changed in the working tree.”  Even if this is
not supported there seems some risk that we would get the same
fingerprint for trees that are actually different.</p>
<p>We could assign a
hypothetical revision id to the tree for uncommitted files.  In that
case there is some risk that the not-yet-committed id would become
visible or committed.</p>
</li>
<li><p class="first">Can we use an “approximate basis”?</p>
<p>When using radix trees, you may need context beyond the specific
directory being compared.</p>
</li>
<li><p class="first">Can you get the fingerprint of parents directories with only selected file ids
taken from the working tree?</p>
<p>With hashes, we’d want to carry through the unselected files and
directories from the values they had in the parent revision.</p>
</li>
<li><p class="first">Are unbalanced trees a significant problem?  Trees can be unbalanced by having
many directories (deep or wide), or many files per directory.</p>
<p>For small trees like bzr, 744 of 874 are in the bzrlib subtree.  In
general, larger trees are more balanced, because humans, editors and
other tools have trouble managing very unbalanced trees.  But there are
exceptions: Aaron has one tree with 20,000 generated but versioned
entries in one directory.</p>
</li>
<li><p class="first">Should we use a radix tree approach where fingerprints are calculated on a synthetic
tree that is by definition balanced, even when the actual tree is unbalanced?</p>
</li>
<li><p class="first">What are the specific advantages of using recursive-last-modified rather than
hashes?</p>
<p>It may be a smaller step change.</p>
<p>It’s a bidirectional link: given a directory text identifier <code class="docutils literal notranslate"><span class="pre">(file_id,</span>
<span class="pre">last_changed)</span></code> you can look up the revision that last changed it.</p>
<p>From the preceding, even without the per-file graph you can skip through
the history of this file: go to the last-changed revision, look at all
its parents and repeat.</p>
</li>
<li><p class="first">Is it a smaller change to use recursive-last-modified on directories?</p>
<p>Probably yes:</p>
<ol class="arabic">
<li><p class="first">We can just put it into the current inventory format without changing
anything else.</p>
<p>By contrast to use a hash we’d have to either split up the inventory
as stored, or change the sort order for the inventory, or synthesize
per-directory inventories in memory for hashing.</p>
<p>However, XML is somewhat redundant and slow to parse/generate; and
reading the whole thing before comparing some sections is only a
partial win.  It may be a smaller change but we’d be preserving
things we want to change.</p>
</li>
</ol>
<ol class="arabic simple">
<li>At present we rarely hash storage representations, only file texts.
This is not a large technical change, but it is a conceptual change.
This has some consequences for how we can upgrade it in future: all
the changed directories need to be rewritten up to the revision level.</li>
</ol>
<ol class="arabic simple">
<li>If we address directories by hash we need hash-addressed
storage.</li>
</ol>
<ol class="arabic simple">
<li>If we address directories by hash then for consistency we’d probably
(not necessarily) want to address file texts by hash.</li>
</ol>
<ol class="arabic simple">
<li>The per-file graph can’t be indexed by hash because they can converge, so we
need to either rework or dispose of the per-file graph.</li>
</ol>
</li>
<li><p class="first">Any possibilities for avoiding hashes recurring?</p>
<ol class="arabic simple">
<li>Hash along with an identification of the parents (as in hg).  Then you
can’t convert a tree without all its basis trees, and there is still
convergence when the same merge is done by two people, and you can’t
create it directly from the working tree.</li>
</ol>
<ol class="arabic simple">
<li>Include last-modified revision id in the hash.</li>
</ol>
<ol class="arabic simple">
<li>Index by <code class="docutils literal notranslate"><span class="pre">(revision,</span> <span class="pre">hash)</span></code> or vice versa.</li>
</ol>
<ol class="arabic simple">
<li>Store a per-file graph and allow it to have repeated keys.  The graph
would tell you about all the parent texts ever seen; you would need
to use revision graph information to resolve ambiguities.</li>
</ol>
</li>
<li><p class="first">What are the specific disadvantages of using recursive-last-modified rather than
hashes?</p>
<p>To calculate the last-changed revision, given the last-changed
information of the contained files, you need to look at the revision
graph.  They’re not enough because you need to know the relations
between the mentioned revisions.  In a merge it’s possible the correct
directory last-modified will not be the same as that of any of the files
within it.  This can also happen when a file is removed (deleted or
renamed) from a directory.</p>
</li>
<li><p class="first">Should we split up storage of the inventories?</p>
<p>This is not quite the same but connected.</p>
</li>
<li><p class="first">How does this relate to per-file/per-directory hashes?</p>
<p>If the version of a file or directory is identified by a hash, we can’t
use that to point into a per-file graph.  We can have a graph indexed by
<code class="docutils literal notranslate"><span class="pre">(file_id,</span> <span class="pre">hash,</span> <span class="pre">revision_id)</span></code>.  The last-modified could be stored as
part of this graph.</p>
<p>The graph would no longer be core data; it could be always present but
might be rebuilt.  Treating it as non-core data may make some changes
like shallow branches easier?</p>
</li>
<li><p class="first">How do you ask a tree for a given text?</p>
<p>Right now we say</p>
<div class="highlight-default notranslate"><div class="highlight"><pre><span></span><span class="n">revision_tree</span><span class="o">.</span><span class="n">get_file_lines</span><span class="p">(</span><span class="n">file_id</span><span class="p">)</span>
</pre></div>
</div>
<p>so the choice of storage is hidden behind the revision tree: it could be
accessed by <code class="docutils literal notranslate"><span class="pre">(file_id,</span> <span class="pre">last_changed)</span></code> or by hash or otherwise.</p>
<p>At the moment the Repository exports a friend api to RevisionTree,
currently usually talking in VersionedFiles.</p>
<p>We probably wouldn’t want Repository to expose a <code class="docutils literal notranslate"><span class="pre">get_text_for_sha1()</span></code>
interface because that would be very difficult to support on old
repositories or on foreign branches.</p>
</li>
</ul>
</div>
<div class="section" id="conclusions">
<h4><a class="toc-backref" href="#id149">1.4.1.4&nbsp;&nbsp;&nbsp;Conclusions</a><a class="headerlink" href="#conclusions" title="Permalink to this headline">¶</a></h4>
</div>
<div class="section" id="design-changes">
<h4><a class="toc-backref" href="#id150">1.4.1.5&nbsp;&nbsp;&nbsp;Design changes</a><a class="headerlink" href="#design-changes" title="Permalink to this headline">¶</a></h4>
</div>
<div class="section" id="id12">
<h4><a class="toc-backref" href="#id151">1.4.1.6&nbsp;&nbsp;&nbsp;API changes</a><a class="headerlink" href="#id12" title="Permalink to this headline">¶</a></h4>
</div>
</div>
</div>
</div>


          </div>
        </div>
      </div>
      <div class="sphinxsidebar" role="navigation" aria-label="main navigation">
        <div class="sphinxsidebarwrapper">
  <h3><a href="index.html">Table of Contents</a></h3>
  <ul>
<li><a class="reference internal" href="#">1&nbsp;&nbsp;&nbsp;Bazaar Performance Roadmap</a><ul>
<li><a class="reference internal" href="#about-the-performance-roadmap">1.1&nbsp;&nbsp;&nbsp;About the performance roadmap</a><ul>
<li><a class="reference internal" href="#what-should-be-in-the-roadmap">1.1.1&nbsp;&nbsp;&nbsp;What should be in the roadmap?</a></li>
<li><a class="reference internal" href="#what-should-the-final-system-look-like-how-is-it-different-to-what-we-have-today">1.1.2&nbsp;&nbsp;&nbsp;What should the final system look like, how is it different to what we have today?</a></li>
<li><a class="reference internal" href="#what-use-cases-should-be-covered">1.1.3&nbsp;&nbsp;&nbsp;What use cases should be covered?</a></li>
<li><a class="reference internal" href="#how-is-development-on-the-roadmap-coordinated">1.1.4&nbsp;&nbsp;&nbsp;How is development on the roadmap coordinated?</a></li>
<li><a class="reference internal" href="#planned-changes-to-the-bzr-core">1.1.5&nbsp;&nbsp;&nbsp;Planned changes to the bzr core</a><ul>
<li><a class="reference internal" href="#library-changes">1.1.5.1&nbsp;&nbsp;&nbsp;Library changes</a></li>
<li><a class="reference internal" href="#interoperable-disk-changes">1.1.5.2&nbsp;&nbsp;&nbsp;Interoperable disk changes</a></li>
<li><a class="reference internal" href="#possibly-non-interoperable-disk-changes">1.1.5.3&nbsp;&nbsp;&nbsp;Possibly non-interoperable disk changes</a></li>
<li><a class="reference internal" href="#non-interoperable-disk-changes">1.1.5.4&nbsp;&nbsp;&nbsp;Non-interoperable disk changes</a></li>
</ul>
</li>
<li><a class="reference internal" href="#integration-of-performance-changes">1.1.6&nbsp;&nbsp;&nbsp;Integration of performance changes</a></li>
</ul>
</li>
<li><a class="reference internal" href="#analysis-of-use-cases">1.2&nbsp;&nbsp;&nbsp;Analysis of use cases</a><ul>
<li><a class="reference internal" href="#analysing-a-specific-use-case">1.2.1&nbsp;&nbsp;&nbsp;Analysing a specific use case</a></li>
<li><a class="reference internal" href="#performing-the-analysis">1.2.2&nbsp;&nbsp;&nbsp;Performing the analysis</a></li>
<li><a class="reference internal" href="#what-factors-should-be-considered">1.2.3&nbsp;&nbsp;&nbsp;What factors should be considered?</a></li>
</ul>
</li>
<li><a class="reference internal" href="#use-cases">1.3&nbsp;&nbsp;&nbsp;Use cases</a><ul>
<li><a class="reference internal" href="#initial-push-pull">1.3.1&nbsp;&nbsp;&nbsp;Initial push / pull</a><ul>
<li><a class="reference internal" href="#optimal-case">1.3.1.1&nbsp;&nbsp;&nbsp;Optimal case</a></li>
<li><a class="reference internal" href="#disk-case">1.3.1.2&nbsp;&nbsp;&nbsp;Disk case</a></li>
<li><a class="reference internal" href="#smart-network-case">1.3.1.3&nbsp;&nbsp;&nbsp;Smart Network Case</a><ul>
<li><a class="reference internal" href="#phase-1">1.3.1.3.1&nbsp;&nbsp;&nbsp;Phase 1</a></li>
<li><a class="reference internal" href="#phase-2">1.3.1.3.2&nbsp;&nbsp;&nbsp;Phase 2</a></li>
</ul>
</li>
<li><a class="reference internal" href="#dumb-network-case">1.3.1.4&nbsp;&nbsp;&nbsp;Dumb Network Case</a></li>
<li><a class="reference internal" href="#wants">1.3.1.5&nbsp;&nbsp;&nbsp;Wants</a></li>
</ul>
</li>
<li><a class="reference internal" href="#incremental-push-pull">1.3.2&nbsp;&nbsp;&nbsp;Incremental push/pull</a><ul>
<li><a class="reference internal" href="#functional-requirements">1.3.2.1&nbsp;&nbsp;&nbsp;Functional Requirements</a></li>
<li><a class="reference internal" href="#factors-which-should-add-work-for-push-pull">1.3.2.2&nbsp;&nbsp;&nbsp;Factors which should add work for push/pull</a></li>
<li><a class="reference internal" href="#push-pull-overview">1.3.2.3&nbsp;&nbsp;&nbsp;Push/pull overview</a><ul>
<li><a class="reference internal" href="#new-data-identification">1.3.2.3.1&nbsp;&nbsp;&nbsp;New data identification</a><ul>
<li><a class="reference internal" href="#set-synchronisation-approaches">1.3.2.3.1.1&nbsp;&nbsp;&nbsp;Set synchronisation approaches</a></li>
<li><a class="reference internal" href="#dag-synchronisation-approaches">1.3.2.3.1.2&nbsp;&nbsp;&nbsp;DAG synchronisation approaches</a></li>
<li><a class="reference internal" href="#file-level-scaling">1.3.2.3.1.3&nbsp;&nbsp;&nbsp;File level scaling</a></li>
<li><a class="reference internal" href="#api-scaling">1.3.2.3.1.4&nbsp;&nbsp;&nbsp;API scaling</a></li>
</ul>
</li>
<li><a class="reference internal" href="#data-reading">1.3.2.3.2&nbsp;&nbsp;&nbsp;Data reading</a><ul>
<li><a class="reference internal" href="#id1">1.3.2.3.2.1&nbsp;&nbsp;&nbsp;File level scaling</a></li>
<li><a class="reference internal" href="#id2">1.3.2.3.2.2&nbsp;&nbsp;&nbsp;API scaling</a></li>
</ul>
</li>
<li><a class="reference internal" href="#data-verification-and-writing">1.3.2.3.3&nbsp;&nbsp;&nbsp;Data Verification and writing</a><ul>
<li><a class="reference internal" href="#overview-summary">1.3.2.3.3.1&nbsp;&nbsp;&nbsp;Overview summary</a></li>
<li><a class="reference internal" href="#id3">1.3.2.3.3.2&nbsp;&nbsp;&nbsp;File level scaling</a></li>
<li><a class="reference internal" href="#id4">1.3.2.3.3.3&nbsp;&nbsp;&nbsp;API scaling</a></li>
</ul>
</li>
</ul>
</li>
<li><a class="reference internal" href="#notes-from-london">1.3.2.4&nbsp;&nbsp;&nbsp;Notes from London</a></li>
</ul>
</li>
<li><a class="reference internal" href="#add">1.3.3&nbsp;&nbsp;&nbsp;Add</a><ul>
<li><a class="reference internal" href="#least-work-we-can-hope-to-perform">1.3.3.1&nbsp;&nbsp;&nbsp;Least work we can hope to perform</a></li>
<li><a class="reference internal" href="#per-file-algorithm">1.3.3.2&nbsp;&nbsp;&nbsp;Per file algorithm</a></li>
</ul>
</li>
<li><a class="reference internal" href="#commit-performance-notes">1.3.4&nbsp;&nbsp;&nbsp;Commit Performance Notes</a><ul>
<li><a class="reference internal" href="#changes-to-commit">1.3.4.1&nbsp;&nbsp;&nbsp;Changes to commit</a></li>
<li><a class="reference internal" href="#commit-the-minimum-work-required">1.3.4.2&nbsp;&nbsp;&nbsp;Commit: The Minimum Work Required</a></li>
<li><a class="reference internal" href="#commit-vs-status">1.3.4.3&nbsp;&nbsp;&nbsp;Commit vs Status</a></li>
<li><a class="reference internal" href="#avoiding-work-smarter-change-detection">1.3.4.4&nbsp;&nbsp;&nbsp;Avoiding Work: Smarter Change Detection</a></li>
<li><a class="reference internal" href="#avoiding-work-better-layering">1.3.4.5&nbsp;&nbsp;&nbsp;Avoiding Work: Better Layering</a></li>
<li><a class="reference internal" href="#avoiding-work-avoiding-reading-parent-data">1.3.4.6&nbsp;&nbsp;&nbsp;Avoiding work: avoiding reading parent data</a></li>
<li><a class="reference internal" href="#code-structure">1.3.4.7&nbsp;&nbsp;&nbsp;Code structure</a></li>
<li><a class="reference internal" href="#complications-of-commit">1.3.4.8&nbsp;&nbsp;&nbsp;Complications of commit</a></li>
<li><a class="reference internal" href="#interface-stack">1.3.4.9&nbsp;&nbsp;&nbsp;Interface stack</a></li>
<li><a class="reference internal" href="#branch-tree-interface">1.3.4.10&nbsp;&nbsp;&nbsp;Branch-&gt;Tree interface</a></li>
<li><a class="reference internal" href="#information-from-the-tree-to-repository">1.3.4.11&nbsp;&nbsp;&nbsp;Information from the tree to repository</a></li>
<li><a class="reference internal" href="#information-from-the-repository-to-the-tree">1.3.4.12&nbsp;&nbsp;&nbsp;Information from the repository to the tree</a></li>
<li><a class="reference internal" href="#selective-commit">1.3.4.13&nbsp;&nbsp;&nbsp;Selective commit</a></li>
<li><a class="reference internal" href="#common-commit-code">1.3.4.14&nbsp;&nbsp;&nbsp;Common commit code</a></li>
<li><a class="reference internal" href="#order-of-traversal">1.3.4.15&nbsp;&nbsp;&nbsp;Order of traversal</a></li>
<li><a class="reference internal" href="#open-question-per-file-graphs">1.3.4.16&nbsp;&nbsp;&nbsp;Open question: per-file graphs</a></li>
</ul>
</li>
<li><a class="reference internal" href="#diff-performance-analysis">1.3.5&nbsp;&nbsp;&nbsp;diff Performance Analysis</a><ul>
<li><a class="reference internal" href="#minimal-work">1.3.5.1&nbsp;&nbsp;&nbsp;Minimal Work</a><ul>
<li><a class="reference internal" href="#reuse-of-historical-comparisons">1.3.5.1.1&nbsp;&nbsp;&nbsp;Reuse of historical comparisons</a></li>
<li><a class="reference internal" href="#historical-tree-against-historical-tree">1.3.5.1.2&nbsp;&nbsp;&nbsp;Historical Tree Against Historical Tree</a></li>
<li><a class="reference internal" href="#basis-against-historical-tree">1.3.5.1.3&nbsp;&nbsp;&nbsp;Basis Against Historical Tree</a></li>
<li><a class="reference internal" href="#basis-against-basis">1.3.5.1.4&nbsp;&nbsp;&nbsp;Basis Against Basis</a></li>
<li><a class="reference internal" href="#working-tree-against-basis">1.3.5.1.5&nbsp;&nbsp;&nbsp;Working Tree Against Basis</a></li>
<li><a class="reference internal" href="#working-tree-against-historical-tree">1.3.5.1.6&nbsp;&nbsp;&nbsp;Working Tree Against Historical Tree</a></li>
<li><a class="reference internal" href="#working-tree-against-working-tree">1.3.5.1.7&nbsp;&nbsp;&nbsp;Working Tree Against Working Tree</a></li>
</ul>
</li>
<li><a class="reference internal" href="#api-changes">1.3.5.2&nbsp;&nbsp;&nbsp;API Changes</a></li>
<li><a class="reference internal" href="#storage-considerations">1.3.5.3&nbsp;&nbsp;&nbsp;Storage considerations</a></li>
</ul>
</li>
<li><a class="reference internal" href="#garbage-collection">1.3.6&nbsp;&nbsp;&nbsp;Garbage Collection</a><ul>
<li><a class="reference internal" href="#id7">1.3.6.1&nbsp;&nbsp;&nbsp;Least work we can hope to perform</a></li>
</ul>
</li>
<li><a class="reference internal" href="#revert">1.3.7&nbsp;&nbsp;&nbsp;Revert</a><ul>
<li><a class="reference internal" href="#id8">1.3.7.1&nbsp;&nbsp;&nbsp;Least work we can hope to perform</a></li>
</ul>
</li>
<li><a class="reference internal" href="#the-status-command">1.3.8&nbsp;&nbsp;&nbsp;The status command</a><ul>
<li><a class="reference internal" href="#ui-overview">1.3.8.1&nbsp;&nbsp;&nbsp;UI Overview</a></li>
<li><a class="reference internal" href="#ideal-work-for-working-tree-to-historical-status">1.3.8.2&nbsp;&nbsp;&nbsp;Ideal work for working tree to historical status</a></li>
<li><a class="reference internal" href="#locality-of-reference">1.3.8.3&nbsp;&nbsp;&nbsp;Locality of reference</a></li>
<li><a class="reference internal" href="#scaling-observations">1.3.8.4&nbsp;&nbsp;&nbsp;Scaling observations</a></li>
</ul>
</li>
<li><a class="reference internal" href="#annotate">1.3.9&nbsp;&nbsp;&nbsp;Annotate</a></li>
<li><a class="reference internal" href="#scaling-analysys-of-merge">1.3.10&nbsp;&nbsp;&nbsp;Scaling analysys of Merge</a><ul>
<li><a class="reference internal" href="#needs">1.3.10.1&nbsp;&nbsp;&nbsp;Needs</a></li>
<li><a class="reference internal" href="#notes">1.3.10.2&nbsp;&nbsp;&nbsp;Notes</a></li>
</ul>
</li>
<li><a class="reference internal" href="#bundle-creation">1.3.11&nbsp;&nbsp;&nbsp;Bundle Creation</a><ul>
<li><a class="reference internal" href="#id10">1.3.11.1&nbsp;&nbsp;&nbsp;Needs</a></li>
</ul>
</li>
<li><a class="reference internal" href="#uncommit-performance-notes">1.3.12&nbsp;&nbsp;&nbsp;Uncommit Performance Notes</a><ul>
<li><a class="reference internal" href="#specification-of-uncommit">1.3.12.1&nbsp;&nbsp;&nbsp;Specification of uncommit</a></li>
</ul>
</li>
<li><a class="reference internal" href="#missing">1.3.13&nbsp;&nbsp;&nbsp;Missing</a></li>
</ul>
</li>
<li><a class="reference internal" href="#subsystem-designs">1.4&nbsp;&nbsp;&nbsp;Subsystem designs</a><ul>
<li><a class="reference internal" href="#directory-fingerprints">1.4.1&nbsp;&nbsp;&nbsp;Directory fingerprints</a><ul>
<li><a class="reference internal" href="#introduction">1.4.1.1&nbsp;&nbsp;&nbsp;Introduction</a></li>
<li><a class="reference internal" href="#use-case-oriented-apis">1.4.1.2&nbsp;&nbsp;&nbsp;Use-case oriented APIs</a><ul>
<li><a class="reference internal" href="#commit">1.4.1.2.1&nbsp;&nbsp;&nbsp;<code class="docutils literal notranslate"><span class="pre">commit</span></code></a></li>
<li><a class="reference internal" href="#log">1.4.1.2.2&nbsp;&nbsp;&nbsp;<code class="docutils literal notranslate"><span class="pre">log</span></code></a></li>
</ul>
</li>
<li><a class="reference internal" href="#open-questions">1.4.1.3&nbsp;&nbsp;&nbsp;Open questions</a></li>
<li><a class="reference internal" href="#conclusions">1.4.1.4&nbsp;&nbsp;&nbsp;Conclusions</a></li>
<li><a class="reference internal" href="#design-changes">1.4.1.5&nbsp;&nbsp;&nbsp;Design changes</a></li>
<li><a class="reference internal" href="#id12">1.4.1.6&nbsp;&nbsp;&nbsp;API changes</a></li>
</ul>
</li>
</ul>
</li>
</ul>
</li>
</ul>

  <h4>Previous topic</h4>
  <p class="topless"><a href="plans.html"
                        title="previous chapter">Plans</a></p>
  <h4>Next topic</h4>
  <p class="topless"><a href="new-config-rationale.html"
                        title="next chapter">Simplifying Bazaar Configuration</a></p>
  <div role="note" aria-label="source link">
    <h3>This Page</h3>
    <ul class="this-page-menu">
      <li><a href="_sources/performance-roadmap.txt"
            rel="nofollow">Show Source</a></li>
    </ul>
   </div>
<div id="searchbox" style="display: none" role="search">
  <h3>Quick search</h3>
    <div class="searchformwrapper">
    <form class="search" action="search.html" method="get">
      <input type="text" name="q" />
      <input type="submit" value="Go" />
      <input type="hidden" name="check_keywords" value="yes" />
      <input type="hidden" name="area" value="default" />
    </form>
    </div>
</div>
<script type="text/javascript">$('#searchbox').show(0);</script>
        </div>
      </div>
      <div class="clearer"></div>
    </div>
    <div class="related" role="navigation" aria-label="related navigation">
      <h3>Navigation</h3>
      <ul>
        <li class="right" style="margin-right: 10px">
          <a href="new-config-rationale.html" title="Simplifying Bazaar Configuration"
             >next</a></li>
        <li class="right" >
          <a href="plans.html" title="Plans"
             >previous</a> |</li>
<li><a href="http://bazaar.canonical.com/">
    <img src="_static/bzr icon 16.png" /> Home</a>&nbsp;|&nbsp;</li>
<a href="http://doc.bazaar.canonical.com/en/">Documentation</a>&nbsp;|&nbsp;</li>

        <li class="nav-item nav-item-0"><a href="index.html">Developer Document Catalog (2.7.0)</a> &#187;</li>

          <li class="nav-item nav-item-1"><a href="plans.html" >Plans</a> &#187;</li> 
      </ul>
    </div>
    <div class="footer" role="contentinfo">
        &#169; Copyright 2009-2011 Canonical Ltd.
      Created using <a href="http://sphinx-doc.org/">Sphinx</a> 1.8.4.
    </div>
  </body>
</html>