summaryrefslogtreecommitdiff
path: root/mlir/test/Integration/Dialect/SparseTensor/taco/tools/mlir_pytaco_utils.py
blob: bd382c0bc9fe5524a0ac92dc77dbf64d317c20be (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
#  Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
#  See https://llvm.org/LICENSE.txt for license information.
#  SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

# This file contains the utilities to process sparse tensor outputs.

from typing import Callable, Dict, Sequence, Tuple
import ctypes
import functools
import numpy as np
import os

# Import MLIR related modules.
from mlir import execution_engine
from mlir import ir
from mlir import runtime
from mlir.dialects import sparse_tensor

from . import mlir_sparse_compiler

# Type aliases for type annotation.
_SupportFunc = Callable[..., None]
_SupportFuncLocator = Callable[[np.dtype], Tuple[_SupportFunc, _SupportFunc]]

# The name for the environment variable that provides the full path for the
# supporting library.
_SUPPORTLIB_ENV_VAR = "SUPPORTLIB"
# The default supporting library if the environment variable is not provided.
_DEFAULT_SUPPORTLIB = "libmlir_c_runner_utils.so"

# The JIT compiler optimization level.
_OPT_LEVEL = 2
# The entry point to the JIT compiled program.
_ENTRY_NAME = "main"


@functools.lru_cache()
def _get_support_lib_name() -> str:
  """Returns the name of the supporting C shared library.

  The value of the SUPPORTLIB environment variable takes precedence; the
  default library name is used when the variable is not set.
  """
  name = os.environ.get(_SUPPORTLIB_ENV_VAR)
  return name if name is not None else _DEFAULT_SUPPORTLIB


@functools.lru_cache()
def _get_sparse_compiler() -> mlir_sparse_compiler.SparseCompiler:
  """Returns a process-wide MLIR sparse compiler with the default settings."""
  support_lib = _get_support_lib_name()
  return mlir_sparse_compiler.SparseCompiler(
      options="", opt_level=_OPT_LEVEL, shared_libs=[support_lib])


def _record_support_funcs(
    ty: np.dtype, to_func: _SupportFunc, from_func: _SupportFunc,
    ty_to_funcs: Dict[np.dtype, Tuple[_SupportFunc, _SupportFunc]]) -> None:
  """Registers the conversion routine pair for the given data type.

  Both routines return an opaque pointer, so their ctypes result type is set
  to c_void_p before the pair is stored in the lookup table.
  """
  for func in (to_func, from_func):
    func.restype = ctypes.c_void_p
  ty_to_funcs[ty] = (to_func, from_func)


@functools.lru_cache()
def _get_support_func_locator() -> _SupportFuncLocator:
  """Constructs a function to locate the supporting functions for a data type.

  Loads the supporting C shared library with the needed routines. Constructs a
  dictionary from the supported data types to the routines for the data types,
  and then a function to look up the dictionary for a given data type.

  The name of the supporting C shared library is either provided by an
  environment variable or a default value.

  Returns:
    The function to look up the supporting functions for a given data type.

  Raises:
    OSError: If there is any problem in loading the shared library.
    ValueError: If the shared library doesn't contain the needed routines.
  """
  # This raises OSError exception if there is any problem in loading the shared
  # library.
  c_lib = ctypes.CDLL(_get_support_lib_name())

  type_to_funcs = {}
  try:
    # Accessing a missing attribute on the CDLL raises AttributeError, which
    # is converted to the documented ValueError below.
    support_types = [(np.int8, c_lib.convertToMLIRSparseTensorI8,
                      c_lib.convertFromMLIRSparseTensorI8),
                     (np.int16, c_lib.convertToMLIRSparseTensorI16,
                      c_lib.convertFromMLIRSparseTensorI16),
                     (np.int32, c_lib.convertToMLIRSparseTensorI32,
                      c_lib.convertFromMLIRSparseTensorI32),
                     (np.int64, c_lib.convertToMLIRSparseTensorI64,
                      c_lib.convertFromMLIRSparseTensorI64),
                     (np.float16, c_lib.convertToMLIRSparseTensorF16,
                      c_lib.convertFromMLIRSparseTensorF16),
                     (np.float32, c_lib.convertToMLIRSparseTensorF32,
                      c_lib.convertFromMLIRSparseTensorF32),
                     (np.float64, c_lib.convertToMLIRSparseTensorF64,
                      c_lib.convertFromMLIRSparseTensorF64),
                     (np.complex64, c_lib.convertToMLIRSparseTensorC32,
                      c_lib.convertFromMLIRSparseTensorC32),
                     (np.complex128, c_lib.convertToMLIRSparseTensorC64,
                      c_lib.convertFromMLIRSparseTensorC64)]
  except Exception as e:
    raise ValueError(f"Missing supporting function: {e}") from e
  for ty, to_func, from_func in support_types:
    _record_support_funcs(ty, to_func, from_func, type_to_funcs)

  def get_support_funcs(ty: np.dtype):
    funcs = type_to_funcs.get(ty)
    # Raise the documented ValueError rather than a bare KeyError for data
    # types the supporting library has no routines for.
    if funcs is None:
      raise ValueError(f"Unsupported data type: {ty}")
    return funcs

  return get_support_funcs


def sparse_tensor_to_coo_tensor(
    sparse_tensor: ctypes.c_void_p,
    dtype: np.dtype,
) -> Tuple[int, int, np.ndarray, np.ndarray, np.ndarray]:
  """Converts an MLIR sparse tensor to a COO-flavored format tensor.

  Args:
     sparse_tensor: A ctypes.c_void_p to the MLIR sparse tensor descriptor.
     dtype: The numpy data type for the tensor elements.

  Returns:
    A tuple that contains the following values for the COO-flavored format
    tensor:
    rank: An integer for the rank of the tensor.
    nse: An integer for the number of non-zero values in the tensor.
    shape: A 1D numpy array of integers, for the shape of the tensor.
    values: A 1D numpy array, for the non-zero values in the tensor.
    indices: A 2D numpy array of integers, representing the indices for the
      non-zero values in the tensor.

  Raises:
    OSError: If there is any problem in loading the shared library.
    ValueError: If the shared library doesn't contain the needed routines.
  """
  convert_from = _get_support_func_locator()(dtype)[1]

  # Output parameters that the C routine fills in.
  c_rank = ctypes.c_ulonglong(0)
  c_nse = ctypes.c_ulonglong(0)
  c_shape = ctypes.POINTER(ctypes.c_ulonglong)()
  c_values = ctypes.POINTER(runtime.as_ctype(np.dtype(dtype)))()
  c_indices = ctypes.POINTER(ctypes.c_ulonglong)()
  convert_from(sparse_tensor, ctypes.byref(c_rank), ctypes.byref(c_nse),
               ctypes.byref(c_shape), ctypes.byref(c_values),
               ctypes.byref(c_indices))

  # Wrap the raw C pointers as numpy arrays of the proper shapes.
  rank = c_rank.value
  nse = c_nse.value
  shape = np.ctypeslib.as_array(c_shape, shape=[rank])
  values = runtime.to_numpy(np.ctypeslib.as_array(c_values, shape=[nse]))
  indices = np.ctypeslib.as_array(c_indices, shape=[nse, rank])
  return rank, nse, shape, values, indices


def coo_tensor_to_sparse_tensor(np_shape: np.ndarray, np_values: np.ndarray,
                                np_indices: np.ndarray, np_perm: np.ndarray,
                                np_sparse: np.ndarray) -> int:
  """Converts a COO-flavored format sparse tensor to an MLIR sparse tensor.

  Args:
     np_shape: A 1D numpy array of integers, for the shape of the tensor.
     np_values: A 1D numpy array, for the non-zero values in the tensor.
     np_indices: A 2D numpy array of integers, representing the indices for the
       non-zero values in the tensor.
     np_perm: A 1D numpy array of integers, representing the storage ordering
       for the dimensions.
     np_sparse: A 1D numpy array of uint8, representing the sparsity values
       for the dimensions.

  Returns:
     An integer for the non-null ctypes.c_void_p to the MLIR sparse tensor
     descriptor.

  Raises:
    OSError: If there is any problem in loading the shared library.
    ValueError: If the shared library doesn't contain the needed routines.
  """

  r = len(np_shape)
  rank = ctypes.c_ulonglong(r)
  nse = ctypes.c_ulonglong(len(np_values))
  # Pass raw views of the numpy buffers; the C routine reads them in place.
  shape = np_shape.ctypes.data_as(ctypes.POINTER(ctypes.c_ulonglong))
  values = np_values.ctypes.data_as(
      ctypes.POINTER(runtime.as_ctype(np.dtype(np_values.dtype))))
  indices = np_indices.ctypes.data_as(ctypes.POINTER(ctypes.c_ulonglong))

  perm = np_perm.ctypes.data_as(ctypes.POINTER(ctypes.c_ulonglong))
  sparse = np_sparse.ctypes.data_as(ctypes.POINTER(ctypes.c_uint8))

  # The conversion routine is selected by the element dtype, so the failure
  # message must name that dtype rather than hardcoding F64.
  convert_to = _get_support_func_locator()(np_values.dtype.type)[0]
  ptr = convert_to(rank, nse, shape, values, indices, perm, sparse)
  assert ptr is not None, (
      f"Problem with calling the convertToMLIRSparseTensor routine for "
      f"dtype {np_values.dtype}")
  return ptr


def compile_and_build_engine(
    module: ir.Module) -> execution_engine.ExecutionEngine:
  """Compiles an MLIR module and builds a JIT execution engine.

  Args:
    module: The MLIR module.

  Returns:
    A JIT execution engine for the MLIR module.
  """
  compiler = _get_sparse_compiler()
  return compiler.compile_and_jit(module)


class _SparseTensorDescriptor(ctypes.Structure):
  """A C structure for an MLIR sparse tensor.

  The field layout must match, byte for byte, the result structure produced
  by the generated MLIR kernel (see _get_create_sparse_tensor_kernel), which
  returns a (tensor, memref) pair through the C interface.
  """
  _fields_ = [
      # A pointer for the MLIR sparse tensor storage.
      ("storage", ctypes.POINTER(ctypes.c_ulonglong)),
      # An MLIR MemRef descriptor for the shape of the sparse tensor.
      ("shape", runtime.make_nd_memref_descriptor(1, ctypes.c_ulonglong)),
  ]


def _output_one_dim(dim: int, rank: int, shape: str, type: str) -> str:
  """Produces the MLIR text code to output the size for the given dimension."""
  return f"""
  %c{dim} = arith.constant {dim} : index
  %d{dim} = tensor.dim %t, %c{dim} : tensor<{shape}x{type}, #enc>
  memref.store %d{dim}, %b[%c{dim}] : memref<{rank}xindex>
"""


# TODO: With better support from MLIR, we may improve the current implementation
# by doing the following:
# (1) Use Python code to generate the kernel instead of doing MLIR text code
#     stitching.
# (2) Use scf.for instead of an unrolled loop to write out the dimension sizes
#     when tensor.dim supports non-constant dimension value.
def _get_create_sparse_tensor_kernel(
    sparsity_codes: Sequence[sparse_tensor.DimLevelType], type: str) -> str:
  """Creates an MLIR text kernel to contruct a sparse tensor from a file.

  The kernel returns a _SparseTensorDescriptor structure.
  """
  rank = len(sparsity_codes)

  # A fully dynamic shape string: one "?" per dimension.
  shape = "x".join(["?"] * rank)

  # Encode each dimension's sparsity as a level-type string.
  sparsity = ", ".join(
      '"compressed"' if s.value else '"dense"' for s in sparsity_codes)

  # Stitch together the code that writes each dimension size to the buffer.
  output_dims = "\n".join(
      _output_one_dim(d, rank, shape, type) for d in range(rank))

  # Return the MLIR text kernel.
  return f"""
!Ptr = !llvm.ptr<i8>
#enc = #sparse_tensor.encoding<{{
  dimLevelType = [ {sparsity} ]
}}>
func.func @{_ENTRY_NAME}(%filename: !Ptr) -> (tensor<{shape}x{type}, #enc>, memref<{rank}xindex>)
attributes {{ llvm.emit_c_interface }} {{
  %t = sparse_tensor.new %filename : !Ptr to tensor<{shape}x{type}, #enc>
  %b = memref.alloc() : memref<{rank}xindex>
  {output_dims}
  return %t, %b : tensor<{shape}x{type}, #enc>, memref<{rank}xindex>
}}"""


def create_sparse_tensor(filename: str,
                         sparsity: Sequence[sparse_tensor.DimLevelType],
                         type: str) -> Tuple[ctypes.c_void_p, np.ndarray]:
  """Creates an MLIR sparse tensor from the input file.

  Args:
    filename: A string for the name of the file that contains the tensor data in
      a COO-flavored format.
    sparsity: A sequence of DimLevelType values, one for each dimension of the
      tensor.
    type: The MLIR string for the data type.

  Returns:
    A Tuple containing the following values:
    storage: A ctypes.c_void_p for the MLIR sparse tensor storage.
    shape: A 1D numpy array of integers, for the shape of the tensor.

  Raises:
    OSError: If there is any problem in loading the supporting C shared library.
    ValueError:  If the shared library doesn't contain the needed routine.
  """
  # Generate, parse, and JIT-compile the kernel that reads the tensor file.
  with ir.Context() as ctx, ir.Location.unknown():
    module = _get_create_sparse_tensor_kernel(sparsity, type)
    module = ir.Module.parse(module)
    engine = compile_and_build_engine(module)

  # A sparse tensor descriptor to receive the kernel result.
  c_tensor_desc = _SparseTensorDescriptor()
  # Convert the filename to a byte stream.
  c_filename = ctypes.c_char_p(bytes(filename, "utf-8"))

  # The C interface expects a pointer to each argument, hence byref on a
  # pointer for the result descriptor and on the filename char pointer.
  arg_pointers = [
      ctypes.byref(ctypes.pointer(c_tensor_desc)),
      ctypes.byref(c_filename)
  ]

  # Invoke the execution engine to run the module and return the result.
  engine.invoke(_ENTRY_NAME, *arg_pointers)
  shape = runtime.ranked_memref_to_numpy(ctypes.pointer(c_tensor_desc.shape))
  return c_tensor_desc.storage, shape


# TODO: With better support from MLIR, we may improve the current implementation
# by using Python code to generate the kernel instead of doing MLIR text code
# stitching.
def _get_output_sparse_tensor_kernel(
        sparsity_codes: Sequence[sparse_tensor.DimLevelType],
        type: str) -> str:
  """Creates an MLIR text kernel to output a sparse tensor to a file.

  The kernel returns void.
  """
  rank = len(sparsity_codes)

  # A fully dynamic shape string: one "?" per dimension.
  shape = "x".join(["?"] * rank)

  # Encode each dimension's sparsity as a level-type string.
  sparsity = ", ".join(
      '"compressed"' if s.value else '"dense"' for s in sparsity_codes)

  # Return the MLIR text kernel.
  return f"""
!Ptr = !llvm.ptr<i8>
#enc = #sparse_tensor.encoding<{{
  dimLevelType = [ {sparsity} ]
}}>
func.func @{_ENTRY_NAME}(%t: tensor<{shape}x{type}, #enc>, %filename: !Ptr)
attributes {{ llvm.emit_c_interface }} {{
  sparse_tensor.out %t, %filename : tensor<{shape}x{type}, #enc>, !Ptr
  func.return
}}"""


def output_sparse_tensor(tensor: ctypes.c_void_p, filename: str,
                         sparsity: Sequence[sparse_tensor.DimLevelType],
                         type: str) -> None:
  """Outputs an MLIR sparse tensor to the given file.

  Args:
    tensor: A C pointer to the MLIR sparse tensor.
    filename: A string for the name of the file that contains the tensor data in
      a COO-flavored format.
    sparsity: A sequence of DimLevelType values, one for each dimension of the
      tensor.
    type: The MLIR string for the data type.

  Raises:
    OSError: If there is any problem in loading the supporting C shared library.
    ValueError:  If the shared library doesn't contain the needed routine.
  """
  # Generate, parse, and JIT-compile the kernel that writes the tensor out.
  with ir.Context() as ctx, ir.Location.unknown():
    kernel_text = _get_output_sparse_tensor_kernel(sparsity, type)
    module = ir.Module.parse(kernel_text)
    engine = compile_and_build_engine(module)

  # Convert the filename to a byte stream.
  c_filename = ctypes.c_char_p(bytes(filename, "utf-8"))

  # The C interface expects a pointer to each argument.
  arg_pointers = [
      ctypes.byref(ctypes.cast(tensor, ctypes.c_void_p)),
      ctypes.byref(c_filename)
  ]

  # Invoke the execution engine to run the module and return the result.
  engine.invoke(_ENTRY_NAME, *arg_pointers)