/* *********************************************************************
   | The Q language - A C++ extension for programming quantum machines |
   | Copyright (C) 2000 2001 2002 2003 Stefano Bettelli                |
   | <bettelli@irsamc.ups-tlse.fr>                                     |
   | See the COPYING and LICENSE files for license terms.              |
   ********************************************************************* */
#include "qoperator.h"                         // base class declaration
#include "qprimitives.h"                       // for Toffoli gates

/* File-local helper (defined below): builds a Qop which calculates the
   AND of "control_size" control qubits. */
static Qop generate_AND_circuit(Qop::size_type control_size);
/* File-local helper (defined below): builds a Qop which replicates one
   qubit, assumed to be in a computational basis state, into "copy_size"
   copies. */
static Qop generate_copy_circuit(Qop::size_type copy_size);

/* *********************************************************************
   | This constructor for a quantum operator accepts another operator  |
   | U and an integer number n and builds the n-controlled-U Qop C.    |
   | This is to say that if U operates on registers with k qubits,     |
   | then C operates on registers with n+k qubits, so that it is the   |
   | identity for all components but when the first n qubits are found |
   | in the |1> state.                                                 |
   | The additional complexity of running the controlled operation in  |
   | this approach is A log k + B log n (thus irrelevant with respect  |
   | to a probably polynomial (in k) complexity for U). Our notion of  |
   | complexity includes the parallelisation of homogeneous operations |
   | inside a single time slice. The additional space required scales  |
   | as n + k - 2 (slightly less than the size of the input register). |
   | ----------------------------------------------------------------- |
   | Step 1: If n>1, generate a circuit which performs the AND of the  |
   |         n qubits in the input register. This circuit must use a   |
   |         number of ancillae which scales linearly with n and it    |
   |         must perform the calculation in a number of time slices   |
   |         which scales with log(n).                                 |
   | Step 2: Calculate the parallelisation degree of "an_operator",    |
   |         i.e. the  maximum number of addresses in one of its time  |
   |         slices (say k). If k>1, generate another circuit which    |
   |         computes k copies of the single control qubit calculated  |
   |         during step 1 (this must take log(k) time).               |
   | Step 3: If necessary (k>1), synchronise the indexes of the qubit  |
   |         lines in the AND circuit and in the COPY circuit, by      |
   |         shifting down the former.                                 |
   | Step 4: send the preparation circuit, the passed operator and the |
   |         number of additional ancillae to a method of the slice    |
   |         list class which does the rest of the job (revise it).    |
   | ----------------------------------------------------------------- |
   | (01 Feb 2003) S.Bettelli, moved most of the work to the slice     |
   | list class, though the result is still unsatisfactory.            |
   | ----------------------------------------------------------------- |
   |                                                                   |
   |  old_ancillae         -----U-------   Schema for qubit lines:     |
   |  parallelisation - 1  ---C-X-C*----   AA is the AND circuit       |
   |  control_size - 1     -A-C---C*-A*-   CC is the COPY circuit      |
   |  control_size         -A--------A*-   * means conjugation         |
   |  register size        -----U-------   X-U means ctrl slices of U  |
   |                                           using X as controls.    |
   |                                                                   |
   | Stefano Bettelli, INFN and Trento University, 15 Sep 2001         |
   | Stefano Bettelli, IRSAMC, UPS, Toulouse,      25 Jun 2002         |
   | Stefano Bettelli, IRSAMC, UPS, Toulouse,      02 Nov 2002         |
   | Stefano Bettelli, IRSAMC, UPS, Toulouse,      01 Feb 2003         |
   ********************************************************************* */
Qop::Qop(const Qop &an_operator, size_type control_size) {
  /* Degenerate request: with no control qubits at all, the "controlled"
     operator is simply defined as a copy of the passed operator. */
  if (control_size == 0) {
    *this = an_operator;
    return;
  }
  /* Ask the passed operator for its parallelisation degree, i.e. the
     largest number of addresses found in a single address list of one
     of its time slices. This is the number of copies of the overall
     control qubit which will be needed. A value of zero means that
     something is wrong: recover by returning an empty operator. */
  size_type copies = an_operator.parallelisation();
  if (copies == 0) return;
  /* First stage of the preparation circuit: the AND of all the control
     qubits, built by a dedicated routine and kept in its own operator
     (this makes adjoining easier later on). */
  Qop control_stage = generate_AND_circuit(control_size);
  /* Second stage, needed only when more than one copy of the overall
     control qubit is required. */
  if (copies > 1) {
    /* replicate the AND qubit "copies" times (a big Schroedinger cat);
       also this circuit is built by a dedicated routine. */
    Qop fanout_stage = generate_copy_circuit(copies);
    /* Align the ancilla usage of the two stages by shifting the AND
       circuit down by copies - 1, so that the qubits with the lowest
       indexes are the copies of the overall control qubit. This
       assumes that the AND circuit has no explicitly set ancillae, so
       that the offset really moves all its formal addresses. */
    control_stage.offset(copies - 1);
    /* Append the copy stage. Since "fanout_stage" dies at the end of
       this block, operator<< (which leaves nothing in its argument) is
       the right choice instead of operator&=. */
    control_stage << fanout_stage;
  }
  /* Number of additional ancilla qubits used by the preparation
     circuit. Both terms are at least 1 at this point, hence the
     result cannot be negative. */
  size_type scratch = control_size + copies - 2;
  /* Hand the preparation circuit, the passed operator and the ancilla
     counts to the slice list method which does the rest of the work;
     the slice list it returns is cannibalised by this operator. */
  Qop gated(~an_operator.get_operations().controlled
            (control_stage.get_operations(),
             scratch + control_size, scratch));
  *this << gated;
}

/* *********************************************************************
   | This function will generate a circuit which calculates the AND of |
   | n="control_size" qubits. The circuit needs n-1 additional ancilla |
   | qubits for the calculation, and approximately log(n) time slices. |
   | The first time slice calculates ~ n/2 Toffoli gates using ~ n/2   |
   | ancillae. The second time slice calculates ~ n/4 Toffoli gates    |
   | with these ancillae as inputs and writes the results in ~ n/4     |
   | additional ancillae. In the optimal case (when n=2^q) this takes  |
   | exactly q time slices and uses n-1 ancillae.                      |
   | ----------------------------------------------------------------- |
   | This call returns immediately if C="control_size" is 0 or 1. The  |
   | result (the overall AND) is calculated in the first qubit line    |
   | (that with index 0); the control register is assumed to be found  |
   | in [C-1, 2*C-2]. When C is greater than 2 we will have "garbage"  |
   | in the middle.                                                    |
   | ----------------------------------------------------------------- |
   | (01 Nov 2002) S.Bettelli, corrected a terrible bug which went un- |
   | noticed for a long time: when updating control_2, one should use  |
   | assignment (=), not subtraction (-=).                             |
   |                                                                   |
   | Stefano Bettelli, INFN and Trento University, 27 Sep 2001         |
   ********************************************************************* */
static Qop generate_AND_circuit(Qop::size_type control_size) {
  /* the circuit under construction (returned at the end). */
  Qop and_circuit;
  /* With zero controls there is nothing to do, and a single control
     line is already its own "AND": the empty operator is the answer
     in both cases. */
  if (control_size < 2) return and_circuit;
  /* "gates_in_slice" is the number of Toffoli gates executed in parallel
     in the current time slice (initially half of the controls, rounded
     down). "upper_ctrl" is the highest address of the first control
     list and "dest" the highest address of the target list. */
  Qop::size_type gates_in_slice = control_size / 2;
  Qop::size_type upper_ctrl     = 2*control_size - 2;
  Qop::size_type dest           = control_size - 2;
  /* One iteration per time slice; the last slice holds a single Toffoli
     gate. The block size for the next slice is the integer half of the
     distance between the (already updated) first control index and
     target index. After the last slice that distance is 1 (the target
     index has stepped one below zero, wrapping around if size_type is
     unsigned -- its definition is not visible here), so the block size
     becomes 0 and the loop stops. */
  for ( ; gates_in_slice != 0; gates_in_slice = (upper_ctrl - dest)/2) {
    /* the second control list starts one (current) block below the
       first one. It is always recomputed from "upper_ctrl". */
    Qop::size_type lower_ctrl = upper_ctrl - gates_in_slice;
    /* a time slice made of parallel Toffoli gates. Each address list is
       a single segment given by its first and last address (the boolean
       is a dummy argument which selects that constructor). */
    QToffoli toffoli_layer
      (Qubit_list(upper_ctrl, upper_ctrl - gates_in_slice + 1, true),
       Qubit_list(lower_ctrl, lower_ctrl - gates_in_slice + 1, true),
       Qubit_list(dest      ,       dest - gates_in_slice + 1, true));
    /* append this time slice to the circuit. */
    and_circuit << toffoli_layer;
    /* move down: the first control index skips both control blocks
       just consumed, the target index skips the block just written. */
    upper_ctrl -= 2*gates_in_slice;
    dest       -=   gates_in_slice;
  }
  /* return by value, relying on the named return value optimisation. */
  return and_circuit;
}

/* *********************************************************************
   | This function will generate a circuit which replicates one qubit  |
   | into "copy_size"=k copies, given that the input qubit's state is  |
   | a computational basis state. The circuit needs k-1 additional     |
   | ancilla qubits for the calculation, and approximately log(k) time |
   | slices. The first time slice calculates 1 CNOT gate, the second   |
   | time slice calculates 2 CNOT gates using the previous two qubits  |
   | as inputs, and so on ... In the optimal case (when k=2^q) this    |
   | takes exactly q time slices and uses k-1 ancillae.                |
   | ----------------------------------------------------------------- |
   | This call returns immediately if "copy_size" is < 2. The scratch  |
   | space of size "copy_size" - 1 is assumed to be found at the be-   |
   | ginning, while the qubit to be "copied" is the following one.     |
   |                                                                   |
   | Stefano Bettelli, INFN and Trento University, 27 Sep 2001         |
   ********************************************************************* */
static Qop generate_copy_circuit(Qop::size_type copy_size) {
  /* the circuit under construction (returned at the end). */
  Qop cat_circuit;
  /* asking for zero or one copy is a dummy call: return the empty
     operator. */
  if (copy_size < 2) return cat_circuit;
  /* "source_top" is the highest address among the qubits used as CNOT
     controls in the current slice (at the beginning, the qubit to be
     copied). "width" is the number of CNOT gates run in parallel in the
     current slice (one at the beginning). */
  Qop::size_type source_top = copy_size - 1;
  Qop::size_type width      = 1;
  /* "pending" counts the copies still to be produced; one iteration per
     time slice, until nothing is pending. */
  for (Qop::size_type pending = copy_size - 1; pending != 0; ) {
    /* Last slice only: when fewer copies are pending than gates could
       be run in parallel, shrink the slice accordingly and move the
       controls to lower addresses, so that the targets end at address
       zero. */
    if (pending < width) {
      width      = pending;
      source_top = 2*width - 1;
    }
    /* a time slice made of parallel CNOT gates: controls are the
       "width" addresses ending at "source_top", targets the "width"
       addresses immediately below them. Each list is a single segment
       given by its first and last address (the boolean is a dummy
       argument which selects that constructor). */
    QCnot cnot_layer
      (Qubit_list(source_top, source_top - width + 1, true),
       Qubit_list(source_top - width, source_top - 2*width + 1, true));
    /* append this time slice to the circuit. */
    cat_circuit << cnot_layer;
    /* "width" copies have just been produced; the next slice can use
       twice as many controls (the doubling after the last slice is
       harmless, since the loop ends). */
    pending -= width;
    width   *= 2;
  }
  /* return by value, relying on the named return value optimisation. */
  return cat_circuit;
}

//;;; Local Variables: ***
//;;; mode:C++ ***
//;;; End: ***
