Index: sage/ext/multi_modular.h
===================================================================
--- sage/ext/multi_modular.h	(revision 2847)
+++ sage/ext/multi_modular.h	(revision 2847)
@@ -0,0 +1,14 @@
+#define mod_int size_t
+
+
+/* This is the largest value we can do arithmatic on without risking overflowing. 
+ * It will be optimized to a constant if any optimization is turned on. 
+ * The expression below assumes unsigned. 
+ */
+#define MOD_INT_MAX ((1 << (sizeof(mod_int)*4)) - 1)
+#define MOD_INT_OVERFLOW ((1 << (sizeof(mod_int)*8)) - 1)
+
+/* Don't want to be up against the limit so 
+ * we have room to gather. 
+ */
+#define START_PRIME_MAX (MOD_INT_MAX/5)
Index: sage/ext/multi_modular.pxd
===================================================================
--- sage/ext/multi_modular.pxd	(revision 2653)
+++ sage/ext/multi_modular.pxd	(revision 2653)
@@ -0,0 +1,19 @@
+include "../ext/cdefs.pxi"
+
+
+cdef extern from "multi_modular.h":
+    ctypedef unsigned long mod_int
+    mod_int MOD_INT_MAX
+    mod_int START_PRIME_MAX
+    
+cdef class MultiModularBasis:
+    cdef int      moduli_count
+    cdef mod_int* moduli
+    cdef mpz_t*   moduli_partial_product
+    cdef mod_int* C # precomputed values for CRT
+    
+    cdef int _extend_moduli_list(self, mpz_t height) except -1
+    cdef int moduli_list_c(self, mod_int** moduli, mpz_t height) except -1
+    cdef int mpz_crt(self, mpz_t z, mod_int* b, int n) except -1
+    cdef int mpz_crt_vec(self, mpz_t* z, mod_int** b, int n, int vn) except -1
+    
Index: sage/ext/multi_modular.pyx
===================================================================
--- sage/ext/multi_modular.pyx	(revision 2653)
+++ sage/ext/multi_modular.pyx	(revision 2653)
@@ -0,0 +1,159 @@
+include "../ext/interrupt.pxi"
+include "../ext/stdsage.pxi"
+include "../ext/gmp.pxi"
+
+
+from sage.rings.integer_ring import ZZ
+from sage.rings.arith import next_prime
+
+from sage.rings.integer import Integer
+from sage.rings.integer cimport Integer
+
+# should I have mod_int versions of these functions? 
+# c_inverse_mod_longlong modular inverse used exactly once in _extend_moduli_list
+from sage.ext.arith cimport arith_llong
+cdef arith_llong ai
+ai = arith_llong()
+
+cdef class MultiModularBasis:
+
+    def __init__(self, start_prime=START_PRIME_MAX):
+        cdef mod_int p
+        p = next_prime(start_prime-1)
+
+        self.moduli_count = 1
+
+        self.moduli = <mod_int*>sage_malloc(sizeof(mod_int))
+        self.moduli_partial_product = <mpz_t*>sage_malloc(sizeof(mpz_t))
+        self.C = <mod_int*>sage_malloc(sizeof(mod_int))
+
+        if self.moduli == NULL or self.moduli_partial_product == NULL or self.C == NULL:
+            raise MemoryError, "out of memory allocating multi-modular prime list"
+
+        self.moduli[0] = p
+        mpz_init_set_ui(self.moduli_partial_product[0], p)
+        self.C[0] = 1
+        
+    def __dealloc__(self):
+        sage_free(self.moduli)
+        sage_free(self.moduli_partial_product)
+        sage_free(self.C)
+
+        
+    cdef int _extend_moduli_list(self, mpz_t height) except -1:
+        if mpz_cmp(height, self.moduli_partial_product[self.moduli_count-1]) <= 0: return self.moduli_count
+        cdef int i
+        new_moduli = []
+        new_partial_products = []
+        new_C = []
+        cdef Integer p, M
+        p = PY_NEW(Integer)
+        mpz_set_ui(p.value, self.moduli[self.moduli_count-1])
+        M = PY_NEW(Integer)
+        mpz_set(M.value, self.moduli_partial_product[self.moduli_count-1])
+        while mpz_cmp(height, M.value) > 0:
+            p = next_prime(p)
+            new_moduli.append(p)
+            new_C.append(ai.c_inverse_mod_longlong(mpz_fdiv_ui(M.value, p), p))
+#            new_C.append((mod(M, p)**(-1)).lift())
+            M *= p
+            new_partial_products.append(M)
+            
+        cdef int new_count
+        new_count = self.moduli_count + len(new_moduli)
+        self.moduli = <mod_int*>sage_realloc(self.moduli, sizeof(mod_int)*new_count)
+        self.moduli_partial_product = <mpz_t*>sage_realloc(self.moduli_partial_product, sizeof(mpz_t)*new_count)
+        self.C = <mod_int*>sage_realloc(self.C, sizeof(mod_int)*new_count)
+        if self.moduli == NULL or self.moduli_partial_product == NULL or self.C == NULL:
+            raise MemoryError, "out of memory allocating multi-modular prime list"
+        
+        for i from self.moduli_count <= i < new_count:
+            self.moduli[i] = new_moduli[i-self.moduli_count]
+            self.C[i] = new_C[i-self.moduli_count]
+            mpz_init_set(self.moduli_partial_product[i], (<Integer>new_partial_products[i-self.moduli_count]).value)
+        self.moduli_count = new_count
+        return new_count
+        
+        
+    cdef int moduli_list_c(self, mod_int** moduli, mpz_t height) except -1:
+        self._extend_moduli_list(height)
+        
+        cdef int count
+        count = self.moduli_count * mpz_sizeinbase(height, 2) / mpz_sizeinbase(self.moduli_partial_product[self.moduli_count-1], 2) # an estimate
+        count = max(min(count, self.moduli_count), 1)
+        while count > 1 and mpz_cmp(height, self.moduli_partial_product[count-1]) < 0: 
+           count -= 1
+        while mpz_cmp(height, self.moduli_partial_product[count-1]) > 0: 
+           count += 1
+        
+        moduli[0] = <mod_int*>sage_malloc(sizeof(mod_int)*count)
+        if moduli[0] == NULL:
+            raise MemoryError, "out of memory allocating multi-modular prime list"
+        memcpy(moduli[0], self.moduli, sizeof(mod_int)*count)
+        return count
+        
+    cdef int mpz_crt(self, mpz_t z, mod_int* b, int n) except -1:
+        # Garner's Algorithm
+        # z is not yet initalized
+        cdef int i
+        cdef mpz_t u
+        mpz_init(u)
+        mpz_init_set_ui(z, b[0])
+        for i from 1 <= i < n:
+            mpz_set_ui(u, ((b[i] + self.moduli[i] - mpz_fdiv_ui(z, self.moduli[i])) * self.C[i]) % self.moduli[i])
+            mpz_mul(u, u, self.moduli_partial_product[i-1])
+            mpz_add(z, z, u)
+        return 0
+
+    cdef int mpz_crt_vec(self, mpz_t* z, mod_int** b, int n, int vc) except -1:
+        # Garner's Algorithm
+        # z allocated but not initalized
+        cdef int i, j
+        cdef mpz_t u
+        mpz_init(u)
+        for j from 0 <= j < vc:
+            mpz_init_set_ui(z[j], b[0][j])
+            for i from 1 <= i < n:
+                mpz_set_ui(u, ((b[i][j] + self.moduli[i] - mpz_fdiv_ui(z[j], self.moduli[i])) * self.C[i]) % self.moduli[i]) # u = ((b_i - z) * C_i) % m_i
+                mpz_mul(u, u, self.moduli_partial_product[i-1])
+                mpz_add(z[j], z[j], u)
+        return 0
+            
+    def moduli_list(self, height):
+        cdef Integer h
+        h = ZZ(height)
+        cdef int i, n
+        cdef mod_int* moduli
+        n = self.moduli_list_c(&moduli, h.value)
+        m = [ZZ(moduli[i]) for i from 0 <= i < n]
+        sage_free(moduli)
+        return m
+        
+    def crt(self, b):
+        cdef int i, n
+        n = len(b)
+        if n > self.moduli_count: 
+            raise IndexError, "beyond bound for multi-modular prime list"
+        cdef mod_int* bs
+        bs = <mod_int*>sage_malloc(sizeof(mod_int)*n)
+        if bs == NULL:
+            raise MemoryError, "out of memory allocating multi-modular prime list"
+        for i from 0 <= i < n:
+            bs[i] = b[i]
+        cdef Integer z
+        z = PY_NEW(Integer)
+        self.mpz_crt(z.value, bs, n)
+        sage_free(bs)
+        return z
+
+    def precomputations(self):
+        cdef int i
+        return [ZZ(self.C[i]) for i from 0 <= i < self.moduli_count]
+        
+    def partial_product(self, n):
+        if n > self.moduli_count: 
+            raise IndexError, "beyond bound for multi-modular prime list"
+        cdef Integer z
+        z = PY_NEW(Integer)
+        mpz_init_set(z.value, self.moduli_partial_product[self.moduli_count-1])
+        return z
Index: sage/matrix/matrix_integer_dense.pxd
===================================================================
--- sage/matrix/matrix_integer_dense.pxd	(revision 1769)
+++ sage/matrix/matrix_integer_dense.pxd	(revision 2653)
@@ -1,3 +1,6 @@
 include "../ext/cdefs.pxi"
+
+cdef extern from "../ext/multi_modular.h":
+    ctypedef unsigned long mod_int
 
 cimport matrix_dense
@@ -9,4 +12,5 @@
     cdef object _pivots
     cdef int mpz_height(self, mpz_t height) except -1
+    cdef _mod_int_c(self, mod_int modulus)
 
     cdef void _zero_out_matrix(self)
Index: sage/matrix/matrix_integer_dense.pyx
===================================================================
--- sage/matrix/matrix_integer_dense.pyx	(revision 2629)
+++ sage/matrix/matrix_integer_dense.pyx	(revision 2654)
@@ -20,7 +20,12 @@
 ctypedef unsigned int uint
 
+from sage.ext.multi_modular cimport MultiModularBasis
+cdef MultiModularBasis mm
+mm = MultiModularBasis()
+
 from sage.rings.integer cimport Integer
 from sage.rings.rational_field import QQ
 from sage.rings.integer_ring import ZZ
+from sage.rings.integer_mod_ring import IntegerModRing
 from sage.rings.polynomial_ring import PolynomialRing
 from sage.structure.element cimport ModuleElement
@@ -35,7 +40,4 @@
 import matrix_space
 
-# for multi-modular methods
-# TODO: make this a const, adjust for machine word size, put somewhere more reasonable
-start_prime = 32771
 
 cdef class Matrix_integer_dense(matrix_dense.Matrix_dense):   # dense or sparse
@@ -605,24 +607,26 @@
         
     def _multiply_multi_modular(left, Matrix_integer_dense right):
+    
+        cdef Integer h
+        cdef mod_int *moduli
+        cdef int i, n
+        
         h = left.height() * right.height()
-        product = start_prime
-#        moduli = [ start_prime ]
-        residues = [ left._mod_int(start_prime) * right._mod_int(start_prime) ]
-        cur_prime = start_prime
-        while product < height:
-            cur_prime = next_prime(cur_prime)
-            product *= cur_prime
-#            moduli.append(cur_prime)
-            res.append(left._mod_int(cur_prime) * right_res._mod_int(cur_prime))
-        return left._lift_crt(residues)
+        n = mm.moduli_list_c(&moduli, h.value)
+        res = []
+        for i from 0 <= i < n:
+            res.append(left._mod_int_c(moduli[i]) * right._mod_int_c(moduli[i]))
+        sage_free(moduli)
+        return left._lift_crt(res)
             
     def _mod_int(self, modulus):
-        cdef int p
+        return self._mod_int_c(modulus)
+
+    cdef _mod_int_c(self, mod_int p):
         cdef Py_ssize_t i, j
-        p = modulus
         cdef Matrix_modn_dense res
         cdef mpz_t* self_row
-        cdef uint* res_row
-        res = Matrix_modn_dense.__new__(matrix_space.MatrixSpace(ZZ, self._nrows, self._ncols, sparse=False), None, None, None)
+        cdef mod_int* res_row
+        res = Matrix_modn_dense.__new__(Matrix_modn_dense, matrix_space.MatrixSpace(IntegerModRing(p), self._nrows, self._ncols, sparse=False), None, None, None)
         for i from 0 <= i < self._nrows:
             self_row = self._matrix[i]
@@ -632,6 +636,37 @@
         return res
         
-    def _lift_crt(residues):
-        raise NotImplementedError
+    def _lift_crt(self, residues):
+
+        cdef size_t n, i, j, k
+        cdef Py_ssize_t nr, nc
+        
+        n = len(residues)
+        nr = residues[0].nrows()
+        nc = residues[0].ncols()
+        
+        for b in residues:
+            if not PY_TYPE_CHECK(b, Matrix_modn_dense):
+                raise TypeError, "Can only perform CRT on list of type Matrix_modn_dense."
+        cdef PyObject** res
+        res = FAST_SEQ_UNSAFE(residues)
+
+        cdef mod_int **row_list
+        row_list = <mod_int**>sage_malloc(sizeof(mod_int*) * n)
+        if row_list == NULL:
+            raise MemoryError, "out of memory allocating multi-modular coefficent list"
+        
+        cdef Matrix_integer_dense M
+        M = Matrix_integer_dense.__new__(Matrix_integer_dense, self.matrix_space(nr, nc), None, None, None)
+        
+        _sig_on
+        for i from 0 <= i < nr:
+            for k from 0 <= k < n:
+                row_list[k] = (<Matrix_modn_dense>res[k]).matrix[i]
+            mm.mpz_crt_vec(M._matrix[i], row_list, n, nc)
+        _sig_off
+        
+        sage_free(row_list)
+        return M
+        
 
     def _echelon_in_place_classical(self):
Index: sage/matrix/matrix_modn_dense.pxd
===================================================================
--- sage/matrix/matrix_modn_dense.pxd	(revision 1772)
+++ sage/matrix/matrix_modn_dense.pxd	(revision 2847)
@@ -1,21 +1,24 @@
 cimport matrix_dense
 
-ctypedef unsigned int uint
+cdef extern from "../ext/multi_modular.h":
+    ctypedef unsigned long mod_int
+    mod_int MOD_INT_MAX
+    mod_int MOD_INT_OVERFLOW
 
 cdef class Matrix_modn_dense(matrix_dense.Matrix_dense):
-    cdef uint **matrix
-    cdef uint *_entries
-    cdef uint p
-    cdef uint gather
+    cdef mod_int **matrix
+    cdef mod_int *_entries
+    cdef mod_int p
+    cdef mod_int gather
 
-    #cdef set_matrix(Matrix_modn_dense self, uint **m)
-    #cdef uint **get_matrix(Matrix_modn_dense self)
-    #cdef uint entry(self, uint i, uint j)
-    cdef _rescale_row_c(self, Py_ssize_t row, uint multiple, Py_ssize_t start_col)
-    cdef _rescale_col_c(self, Py_ssize_t col, uint multiple, Py_ssize_t start_row)    
+    #cdef set_matrix(Matrix_modn_dense self, mod_int **m)
+    #cdef mod_int **get_matrix(Matrix_modn_dense self)
+    #cdef mod_int entry(self, mod_int i, mod_int j)
+    cdef _rescale_row_c(self, Py_ssize_t row, mod_int multiple, Py_ssize_t start_col)
+    cdef _rescale_col_c(self, Py_ssize_t col, mod_int multiple, Py_ssize_t start_row)    
     cdef _add_multiple_of_row_c(self,  Py_ssize_t row_to, Py_ssize_t row_from,
-                                uint multiple, Py_ssize_t start_col)
+                                mod_int multiple, Py_ssize_t start_col)
     cdef _add_multiple_of_column_c(self, Py_ssize_t col_to, Py_ssize_t col_from,
-                                   uint multiple, Py_ssize_t start_row)
+                                   mod_int multiple, Py_ssize_t start_row)
     
 
Index: sage/matrix/matrix_modn_dense.pyx
===================================================================
--- sage/matrix/matrix_modn_dense.pyx	(revision 2629)
+++ sage/matrix/matrix_modn_dense.pyx	(revision 2847)
@@ -112,21 +112,21 @@
         matrix_dense.Matrix_dense.__init__(self, parent)
 
-        cdef uint p
+        cdef mod_int p
         p = self._base_ring.characteristic()
         self.p = p
-        if p >= 46340:
-            raise OverflowError, "p (=%s) must be < 46340"%p
-        self.gather = 2**32/(p*p)
+        if p >= MOD_INT_MAX:
+            raise OverflowError, "p (=%s) must be < %s"%(p, MOD_INT_MAX)
+        self.gather = MOD_INT_OVERFLOW / (p*p)
             
-        self._entries = <uint *> sage_malloc(sizeof(uint)*self._nrows*self._ncols)
+        self._entries = <mod_int *> sage_malloc(sizeof(mod_int)*self._nrows*self._ncols)
         if self._entries == NULL:
            raise MemoryError, "Error allocating matrix"        
 
-        self.matrix = <uint **> sage_malloc(sizeof(uint*)*self._nrows)
+        self.matrix = <mod_int **> sage_malloc(sizeof(mod_int*)*self._nrows)
         if self.matrix == NULL:
             sage_free(self._entries)
             raise MemoryError, "Error allocating memory"
 
-        cdef uint k
+        cdef mod_int k
         cdef Py_ssize_t i
         k = 0
@@ -143,7 +143,7 @@
     def __init__(self, parent, entries, copy, coerce):
         
-        cdef uint e
+        cdef mod_int e
         cdef Py_ssize_t i, j, k
-        cdef uint *v
+        cdef mod_int *v
 
         # scalar?
@@ -164,5 +164,5 @@
         
         k = 0
-        cdef uint n
+        cdef mod_int n
         R = self.base_ring()
 
@@ -229,5 +229,5 @@
         A = Matrix_modn_dense.__new__(Matrix_modn_dense, self._parent,
                                       0, 0, 0)
-        memcpy(A._entries, self._entries, sizeof(uint)*self._nrows*self._ncols)
+        memcpy(A._entries, self._entries, sizeof(mod_int)*self._nrows*self._ncols)
         A.p = self.p
         A.gather = self.gather
@@ -257,6 +257,6 @@
 
         cdef Py_ssize_t start_row, c, r, nr, nc, i
-        cdef uint p, a, a_inverse, b
-        cdef uint **m
+        cdef mod_int p, a, a_inverse, b
+        cdef mod_int **m
 
         start_row = 0
@@ -295,7 +295,7 @@
         self._rescale_row_c(row, multiple, start_col)
         
-    cdef _rescale_row_c(self, Py_ssize_t row, uint multiple, Py_ssize_t start_col):
-        cdef uint r, p
-        cdef uint* v
+    cdef _rescale_row_c(self, Py_ssize_t row, mod_int multiple, Py_ssize_t start_col):
+        cdef mod_int r, p
+        cdef mod_int* v
         cdef Py_ssize_t i
         p = self.p
@@ -307,5 +307,5 @@
         self._rescale_col_c(col, multiple, start_row)
         
-    cdef _rescale_col_c(self, Py_ssize_t col, uint multiple, Py_ssize_t start_row):
+    cdef _rescale_col_c(self, Py_ssize_t col, mod_int multiple, Py_ssize_t start_row):
         """
         EXAMPLES:
@@ -339,6 +339,6 @@
             [ 2  5 34]            
         """
-        cdef uint r, p
-        cdef uint* v
+        cdef mod_int r, p
+        cdef mod_int* v
         cdef Py_ssize_t i
         p = self.p
@@ -350,8 +350,8 @@
         self._add_multiple_of_row_c(row_to, row_from, multiple, start_col)
         
-    cdef _add_multiple_of_row_c(self,  Py_ssize_t row_to, Py_ssize_t row_from, uint multiple,
+    cdef _add_multiple_of_row_c(self,  Py_ssize_t row_to, Py_ssize_t row_from, mod_int multiple,
                                Py_ssize_t start_col):
-        cdef uint p
-        cdef uint *v_from, *v_to
+        cdef mod_int p
+        cdef mod_int *v_from, *v_to
 
         p = self.p
@@ -368,8 +368,8 @@
         self._add_multiple_of_column_c(col_to, col_from, s, start_row)
 
-    cdef _add_multiple_of_column_c(self, Py_ssize_t col_to, Py_ssize_t col_from, uint multiple,
+    cdef _add_multiple_of_column_c(self, Py_ssize_t col_to, Py_ssize_t col_from, mod_int multiple,
                                    Py_ssize_t start_row):
-        cdef uint  p
-        cdef uint **m
+        cdef mod_int  p
+        cdef mod_int **m
         
         m = self.matrix
@@ -382,5 +382,5 @@
 
     cdef swap_rows_c(self, Py_ssize_t row1, Py_ssize_t row2):
-        cdef uint* temp
+        cdef mod_int* temp
         temp = self.matrix[row1]
         self.matrix[row1] = self.matrix[row2]
@@ -389,6 +389,6 @@
     cdef swap_columns_c(self, Py_ssize_t col1, Py_ssize_t col2):
         cdef Py_ssize_t i, nr
-        cdef uint t
-        cdef uint **m
+        cdef mod_int t
+        cdef mod_int **m
         m = self.matrix
         nr = self._nrows
@@ -412,8 +412,8 @@
         n = self._nrows
 
-        cdef uint **h
+        cdef mod_int **h
         h = self.matrix
 
-        cdef uint p, r, t, t_inv, u
+        cdef mod_int p, r, t, t_inv, u
         cdef Py_ssize_t i, j, m
         p = self.p
@@ -477,5 +477,5 @@
         n = self._nrows
 
-        cdef uint p, t
+        cdef mod_int p, t
         p = self.p
 
@@ -529,5 +529,6 @@
     #  A = matrix(Integers(389),4,range(16)); A._echelon_strassen(4)
     # *** glibc detected *** free(): invalid next size (fast): 0x0000000000fb15e0 ***
-    def xxx_matrix_window(self, Py_ssize_t row=0, Py_ssize_t col=0,
+    # error in set_to memcpy on 64-bit
+    def matrix_window(self, Py_ssize_t row=0, Py_ssize_t col=0,
                       Py_ssize_t nrows=-1, Py_ssize_t ncols=-1):
         """
@@ -550,4 +551,5 @@
             <type 'sage.matrix.matrix_modn_dense.Matrix_modn_dense'>
         """
+        print "making matrix window"
         if nrows == -1:
             nrows = self._nrows - row
Index: sage/matrix/matrix_rational_dense.pyx
===================================================================
--- sage/matrix/matrix_rational_dense.pyx	(revision 2628)
+++ sage/matrix/matrix_rational_dense.pyx	(revision 2654)
@@ -439,5 +439,5 @@
         A, A_denom = left._clear_denom()
         B, B_denom = right._clear_denom()
-        AB = A*B # A._multiply_multi_modular(B)
+        AB = A._multiply_multi_modular(B)
         D = A_denom * B_denom
         res = Matrix_rational_dense.__new__(Matrix_rational_dense, left.matrix_space(AB._nrows, AB._ncols), 0, 0, 0)
Index: sage/matrix/matrix_window_modn_dense.pxd
===================================================================
--- sage/matrix/matrix_window_modn_dense.pxd	(revision 1872)
+++ sage/matrix/matrix_window_modn_dense.pxd	(revision 2847)
@@ -1,5 +1,7 @@
 from matrix_window cimport MatrixWindow
 
-ctypedef unsigned int uint
+cdef extern from "../ext/multi_modular.h":
+    ctypedef unsigned long mod_int
+    mod_int MOD_INT_MAX
 
 cdef class MatrixWindow_modn_dense(MatrixWindow):
Index: sage/matrix/matrix_window_modn_dense.pyx
===================================================================
--- sage/matrix/matrix_window_modn_dense.pyx	(revision 1993)
+++ sage/matrix/matrix_window_modn_dense.pyx	(revision 2847)
@@ -8,4 +8,5 @@
 from sage.matrix.matrix_modn_dense cimport Matrix_modn_dense
 
+# TODO: what if we used the actual pointers for our loop variables?
 
 cdef class MatrixWindow_modn_dense(matrix_window.MatrixWindow):
@@ -23,6 +24,6 @@
         """
         cdef Py_ssize_t i, j
-        cdef uint** self_rows
-        cdef uint** A_rows
+        cdef mod_int** self_rows
+        cdef mod_int** A_rows
         if self._nrows != A._nrows or self._ncols != A._ncols:
             raise ArithmeticError, "incompatible dimensions"
@@ -30,148 +31,206 @@
         A_rows = ( <Matrix_modn_dense> A._matrix ).matrix
         for i from 0 <= i < self._nrows:
-            memcpy(self_rows[i+self._row] + self._col, A_rows[i+A._row] + A._col, self._ncols * sizeof(uint*))
+            memcpy(self_rows[i+self._row] + self._col, A_rows[i+A._row] + A._col, self._ncols * sizeof(mod_int*))
                 
     cdef set_to_zero(MatrixWindow_modn_dense self):
         cdef Py_ssize_t i, j
-        cdef uint** rows
+        cdef mod_int** rows
         rows = ( <Matrix_modn_dense> self._matrix ).matrix
         for i from self._row <= i < self._row + self._nrows:
-            memset(rows[i] + self._col, 0, self._ncols * sizeof(uint*))
+            memset(rows[i] + self._col, 0, self._ncols * sizeof(mod_int*))
 
     cdef add(self, MatrixWindow A):
         cdef Py_ssize_t i, j
-        cdef uint p
-        cdef uint** self_matrix
-        cdef uint** A_matrix
-        if self._nrows != A._nrows or self._ncols != A._ncols:
-            raise ArithmeticError, "incompatible dimensions"
-        self_matrix = ( <Matrix_modn_dense> self._matrix ).matrix
-        A_matrix = ( <Matrix_modn_dense> A._matrix ).matrix
-        p = ( <Matrix_modn_dense> self._matrix ).p
-        for i from 0 <= i < self._nrows:
-            for j from 0 <= j < self._ncols:
-                # I really want to do in place operations here...
-                self_matrix[i+self._row][j+self._col] = self_matrix[i+self._row][j+self._col] + A_matrix[i+A._row][j+A._col]
-                if self_matrix[i+self._row][j+self._col] >= p:
-                    self_matrix[i+self._row][j+self._col] = self_matrix[i+self._row][j+self._col] - p
+        cdef mod_int p
+        cdef mod_int* self_row
+        cdef mod_int* A_row
+        if self._nrows != A._nrows or self._ncols != A._ncols:
+            raise ArithmeticError, "incompatible dimensions"
+        p = ( <Matrix_modn_dense> self._matrix ).p
+        for i from 0 <= i < self._nrows:
+            self_row = ( <Matrix_modn_dense> self._matrix ).matrix[i + self._row] + self._col
+            A_row    = ( <Matrix_modn_dense>    A._matrix ).matrix[i +    A._row] + A._col
+            for j from 0 <= j < self._ncols:
+                self_row[j] += A_row[j]
+                if self_row[j] >= p:
+                    self_row[j] -= p
     
     cdef subtract(self, MatrixWindow A):
         cdef Py_ssize_t i, j
-        cdef uint p
-        cdef uint** self_matrix
-        cdef uint** A_matrix
-        if self._nrows != A._nrows or self._ncols != A._ncols:
-            raise ArithmeticError, "incompatible dimensions"
-        self_matrix = ( <Matrix_modn_dense> self._matrix ).matrix
-        A_matrix = ( <Matrix_modn_dense> A._matrix ).matrix
-        p = ( <Matrix_modn_dense> self._matrix ).p
-        for i from 0 <= i < self._nrows:
-            for j from 0 <= j < self._ncols:
-                # I really want to do in place operations here...
-                self_matrix[i+self._row][j+self._col] = p + self_matrix[i+self._row][j+self._col] - A_matrix[i+A._row][j+A._col]
-                if self_matrix[i+self._row][j+self._col] >= p:
-                    self_matrix[i+self._row][j+self._col] = self_matrix[i+self._row][j+self._col] - p
+        cdef mod_int p
+        cdef mod_int* self_row
+        cdef mod_int* A_row
+        if self._nrows != A._nrows or self._ncols != A._ncols:
+            raise ArithmeticError, "incompatible dimensions"
+        p = ( <Matrix_modn_dense> self._matrix ).p
+        for i from 0 <= i < self._nrows:
+            self_row = ( <Matrix_modn_dense> self._matrix ).matrix[i + self._row] + self._col
+            A_row    = ( <Matrix_modn_dense>    A._matrix ).matrix[i +    A._row] + A._col
+            for j from 0 <= j < self._ncols:
+                if self_row[j] >= A_row[j]:
+                    self_row[j] -= A_row[j]
+                else:
+                    self_row[j] += p - A_row[j]
         
     cdef set_to_sum(self, MatrixWindow A, MatrixWindow B):
         cdef Py_ssize_t i, j
-        cdef uint p
-        cdef uint** self_matrix
-        cdef uint** A_matrix
-        cdef uint** B_matrix
+        cdef mod_int p
+        cdef mod_int* self_row
+        cdef mod_int* A_row
+        cdef mod_int* B_row
         if self._nrows != A._nrows or self._ncols != A._ncols:
             raise ArithmeticError, "incompatible dimensions"
         if self._nrows != B._nrows or self._ncols != B._ncols:
             raise ArithmeticError, "incompatible dimensions"
-        self_matrix = ( <Matrix_modn_dense> self._matrix ).matrix
-        A_matrix = ( <Matrix_modn_dense> A._matrix ).matrix
-        B_matrix = ( <Matrix_modn_dense> B._matrix ).matrix
-        p = ( <Matrix_modn_dense> self._matrix ).p
-        for i from 0 <= i < self._nrows:
-            for j from 0 <= j < self._ncols:
-                self_matrix[i+self._row][j+self._col] = A_matrix[i+A._row][j+A._col] + B_matrix[i+B._row][j+B._col]
-                if self_matrix[i+self._row][j+self._col] >= p:
-                    # I really want to do in place operations here...
-                    self_matrix[i+self._row][j+self._col] = self_matrix[i+self._row][j+self._col] - p
+        p = ( <Matrix_modn_dense> self._matrix ).p
+        for i from 0 <= i < self._nrows:
+            self_row = ( <Matrix_modn_dense> self._matrix ).matrix[i + self._row] + self._col
+            A_row    = ( <Matrix_modn_dense>    A._matrix ).matrix[i +    A._row] + A._col
+            B_row    = ( <Matrix_modn_dense>    B._matrix ).matrix[i +    B._row] + B._col
+            for j from 0 <= j < self._ncols:
+                self_row[j] = A_row[j] + B_row[j]
+                if self_row[j] >= p:
+                    self_row[j] -= p
 
     cdef set_to_diff(self, MatrixWindow A, MatrixWindow B):
         cdef Py_ssize_t i, j
-        cdef uint p
-        cdef uint** self_matrix
-        cdef uint** A_matrix
-        cdef uint** B_matrix
+        cdef mod_int p
+        cdef mod_int* self_row
+        cdef mod_int* A_row
+        cdef mod_int* B_row
         if self._nrows != A._nrows or self._ncols != A._ncols:
             raise ArithmeticError, "incompatible dimensions"
         if self._nrows != B._nrows or self._ncols != B._ncols:
             raise ArithmeticError, "incompatible dimensions"
-        self_matrix = ( <Matrix_modn_dense> self._matrix ).matrix
-        A_matrix = ( <Matrix_modn_dense> A._matrix ).matrix
-        B_matrix = ( <Matrix_modn_dense> B._matrix ).matrix
-        p = ( <Matrix_modn_dense> self._matrix ).p
-        for i from 0 <= i < self._nrows:
-            for j from 0 <= j < self._ncols:
-                self_matrix[i+self._row][j+self._col] =  p + A_matrix[i+A._row][j+A._col] - B_matrix[i+B._row][j+B._col]
-                if self_matrix[i+self._row][j+self._col] >= p:
-                    # I really want to do in place operations here...
-                    self_matrix[i+self._row][j+self._col] = self_matrix[i+self._row][j+self._col] - p
+        p = ( <Matrix_modn_dense> self._matrix ).p
+        for i from 0 <= i < self._nrows:
+            self_row = ( <Matrix_modn_dense> self._matrix ).matrix[i + self._row] + self._col
+            A_row    = ( <Matrix_modn_dense>    A._matrix ).matrix[i +    A._row] + A._col
+            B_row    = ( <Matrix_modn_dense>    B._matrix ).matrix[i +    B._row] + B._col
+            for j from 0 <= j < self._ncols:
+                if A_row[j] > B_row[j]:
+                    self_row[j] = A_row[j] - B_row[j]
+                else:
+                    self_row[j] = p + A_row[j] - B_row[j]
                 
     cdef set_to_prod(self, MatrixWindow A, MatrixWindow B):
-        # TODO: gather
-        cdef Py_ssize_t i, j, k
-        cdef uint p, s
-        cdef uint** self_matrix
-        cdef uint** A_matrix
-        cdef uint** B_matrix
+        cdef Py_ssize_t i, j, k, gather, top, A_ncols
+        cdef mod_int p, s
+        cdef mod_int* self_row
+        cdef mod_int* A_row
+        cdef mod_int** B_matrix_off
         if A._ncols != B._nrows or self._nrows != A._nrows or self._ncols != B._ncols:
             raise ArithmeticError, "incompatible dimensions"
-        self_matrix = ( <Matrix_modn_dense> self._matrix ).matrix
-        A_matrix = ( <Matrix_modn_dense> A._matrix ).matrix
-        B_matrix = ( <Matrix_modn_dense> B._matrix ).matrix
-        p = ( <Matrix_modn_dense> self._matrix ).p
-        for i from 0 <= i < A._nrows:
-            for j from 0 <= j < B._ncols:
-                s = 0
-                for k from 0 <= k < A._ncols:
-                    s = (s + A_matrix[i+A._row][k+A._col] * B_matrix[k+B._row][j+B._col]) % p
-                self_matrix[i+self._row][j+self._col] = s % p
+        B_matrix_off = ( <Matrix_modn_dense> B._matrix ).matrix + B._row
+        p = ( <Matrix_modn_dense> self._matrix ).p
+        gather = ( <Matrix_modn_dense> self._matrix ).gather
+        A_ncols = A._ncols
+        
+        if gather <= 1:
+            for i from 0 <= i < A._nrows:
+                self_row = ( <Matrix_modn_dense> self._matrix ).matrix[i + self._row] + self._col
+                A_row    = ( <Matrix_modn_dense>    A._matrix ).matrix[i +    A._row] + A._col
+                for j from 0 <= j < B._ncols:
+                    s = 0
+                    for k from 0 <= k < A._ncols:
+                        s = (s + A_row[k] * B_matrix_off[k][j+B._col]) % p
+                    self_row[j] = s
+        else:
+            for i from 0 <= i < A._nrows:
+                self_row = ( <Matrix_modn_dense> self._matrix ).matrix[i + self._row] + self._col
+                A_row    = ( <Matrix_modn_dense>    A._matrix ).matrix[i +    A._row] + A._col
+                for j from 0 <= j < B._ncols:
+                    s = 0
+                    k = 0
+                    while k < A_ncols:
+                        top = k + gather
+                        if top > A_ncols:
+                            top = A_ncols
+                        for k from k <= k < top: # = min(k+gather, A._ncols)
+                            s += A_row[k] * B_matrix_off[k][j+B._col]
+                        s %= p
+                    self_row[j] = s
         
     cdef add_prod(self, MatrixWindow A, MatrixWindow B):
         # TODO: gather
-        cdef Py_ssize_t i, j, k
-        cdef uint p, s
-        cdef uint** self_matrix
-        cdef uint** A_matrix
-        cdef uint** B_matrix
+        cdef Py_ssize_t i, j, k, A_ncols
+        cdef mod_int p, s
+        cdef mod_int* self_row
+        cdef mod_int* A_row
+        cdef mod_int* B_row
         if A._ncols != B._nrows or self._nrows != A._nrows or self._ncols != B._ncols:
             raise ArithmeticError, "incompatible dimensions"
-        self_matrix = ( <Matrix_modn_dense> self._matrix ).matrix
-        A_matrix = ( <Matrix_modn_dense> A._matrix ).matrix
-        B_matrix = ( <Matrix_modn_dense> B._matrix ).matrix
-        p = ( <Matrix_modn_dense> self._matrix ).p
-        for i from 0 <= i < A._nrows:
-            for j from 0 <= j < B._ncols:
-                s = self_matrix[i+self._row][j+self._col]
-                for k from 0 <= k < A._ncols:
-                    s = ( s + A_matrix[i+A._row][k+A._col] * B_matrix[k+B._row][j+B._col] ) % p
-                self_matrix[i+self._row][j+self._col] = s % p
+        p = ( <Matrix_modn_dense> self._matrix ).p
+        gather = ( <Matrix_modn_dense> self._matrix ).gather
+        A_ncols = A._ncols
+        
+        if gather <= 1:
+            for i from 0 <= i < A._nrows:
+                self_row = ( <Matrix_modn_dense> self._matrix ).matrix[i+self._row] + self._col
+                A_row = ( <Matrix_modn_dense> A._matrix ).matrix[i+B._row] + B._col
+                B_row = ( <Matrix_modn_dense> B._matrix ).matrix[i+A._row] + A._col
+                for j from 0 <= j < B._ncols:
+                    s = self_row[j]
+                    for k from 0 <= k < A._ncols:
+                        s = ( s + A_row[k] * B_row[j] ) % p
+                    self_row[j] = s
+                    
+        else:
+            for i from 0 <= i < A._nrows:
+                self_row = ( <Matrix_modn_dense> self._matrix ).matrix[i+self._row] + self._col
+                A_row = ( <Matrix_modn_dense> A._matrix ).matrix[i+B._row] + B._col
+                B_row = ( <Matrix_modn_dense> B._matrix ).matrix[i+A._row] + A._col
+                for j from 0 <= j < B._ncols:
+                    s = self_row[j]
+                    k = 0
+                    while k < A_ncols:
+                        top = k + gather
+                        if top > A_ncols:
+                            top = A_ncols
+                        for k from k <= k < top: # = min(k+gather, A._ncols)
+                            s += A_row[k] * B_matrix_off[k][j+B._col]
+                        s %= p
+                    self_row[j] = s
+                    
                 
     cdef subtract_prod(self, MatrixWindow A, MatrixWindow B):
         # TODO: gather
-        cdef Py_ssize_t i, j, k
-        cdef uint p, s
-        cdef uint** self_matrix
-        cdef uint** A_matrix
-        cdef uint** B_matrix
+        cdef Py_ssize_t i, j, k, A_ncols
+        cdef mod_int p, s
+        cdef mod_int* self_row
+        cdef mod_int* A_row
+        cdef mod_int* B_row
         if A._ncols != B._nrows or self._nrows != A._nrows or self._ncols != B._ncols:
             raise ArithmeticError, "incompatible dimensions"
-        self_matrix = ( <Matrix_modn_dense> self._matrix ).matrix
-        A_matrix = ( <Matrix_modn_dense> A._matrix ).matrix
-        B_matrix = ( <Matrix_modn_dense> B._matrix ).matrix
-        p = ( <Matrix_modn_dense> self._matrix ).p
-        for i from 0 <= i < A._nrows:
-            for j from 0 <= j < B._ncols:
-                s = self_matrix[i+self._row][j+self._col]
-                for k from 0 <= k < A._ncols:
-                    s = s + p - ( A_matrix[i+A._row][k+A._col] * B_matrix[k+B._row][j+B._col] ) % p
-                self_matrix[i+self._row][j+self._col] = s % p
+        p = ( <Matrix_modn_dense> self._matrix ).p
+        A_ncols = A._ncols
+        
+        if gather <= 1:
+            for i from 0 <= i < A._nrows:
+                self_row = ( <Matrix_modn_dense> self._matrix ).matrix[i+self._row] + self._col
+                A_row = ( <Matrix_modn_dense> A._matrix ).matrix[i+B._row] + B._col
+                B_row = ( <Matrix_modn_dense> B._matrix ).matrix[i+A._row] + A._col
+                for j from 0 <= j < B._ncols:
+                    s = self_row[j]
+                    for k from 0 <= k < A._ncols:
+                        s = s + p - ( A_row[k] * B_row[j] ) % p
+                    self_row[j] = s
+                    
+        else:
+            for i from 0 <= i < A._nrows:
+                self_row = ( <Matrix_modn_dense> self._matrix ).matrix[i+self._row] + self._col
+                A_row = ( <Matrix_modn_dense> A._matrix ).matrix[i+B._row] + B._col
+                B_row = ( <Matrix_modn_dense> B._matrix ).matrix[i+A._row] + A._col
+                for j from 0 <= j < B._ncols:
+                    s = self_row[j] + p * gather # because they're unsigned
+                    k = 0
+                    while k < A_ncols:
+                        top = k + gather
+                        if top > A_ncols:
+                            top = A_ncols
+                        for k from k <= k < top: # = min(k+gather, A._ncols)
+                            s -= A_row[k] * B_matrix_off[k][j+B._col]
+                        s = s % p  +  p * gather # because they're unsigne
+                    self_row[j] = s % p
 
     cdef int element_is_zero(self, Py_ssize_t i, Py_ssize_t j):
Index: setup.py
===================================================================
--- setup.py	(revision 2632)
+++ setup.py	(revision 2847)
@@ -314,4 +314,8 @@
     Extension('sage.ext.arith_gmp',
               sources = ['sage/ext/arith_gmp.pyx'],
+              libraries=['gmp']), \
+
+    Extension('sage.ext.multi_modular',
+              sources = ['sage/ext/multi_modular.pyx'],
               libraries=['gmp']), \
 
