full GPU support (#142)
* initial GPU support

* ensure tests only try to use CUDA if a device exists

* more stringent xgboost_jll compat requirement

* think this is done

* use gpu_hist by default if GPU arrays are used

* oops... that was sneaky

* bump version

* Update docs/src/features.md

Co-authored-by: Rik Huijzer <[email protected]>

* Update docs/src/features.md

Co-authored-by: Rik Huijzer <[email protected]>

* Update docs/src/features.md

Co-authored-by: Rik Huijzer <[email protected]>

* Update docs/src/index.md

Co-authored-by: Rik Huijzer <[email protected]>

* update confusing comment

Co-authored-by: Rik Huijzer <[email protected]>
ExpandingMan and rikhuijzer authored Dec 28, 2022
1 parent f060536 commit d030ece
Showing 10 changed files with 152 additions and 16 deletions.
5 changes: 3 additions & 2 deletions Project.toml
@@ -1,10 +1,11 @@
name = "XGBoost"
uuid = "009559a3-9522-5dbb-924b-0b6ed2b22bb9"
version = "2.1.1"
version = "2.2.0"

[deps]
AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
CEnum = "fa961155-64e5-5f13-b03f-caf6b980ea82"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
JSON3 = "0f8b85d8-7281-11e9-16c2-39a750bddbf1"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
@@ -24,7 +25,7 @@ OrderedCollections = "1"
SparseMatricesCSR = "0.6"
Tables = "1"
Term = "1"
XGBoost_jll = "1.7"
XGBoost_jll = "1.7.2"
julia = "1.6"

[extras]
1 change: 1 addition & 0 deletions docs/src/api.md
@@ -22,6 +22,7 @@ slice
nrows
ncols
size(::DMatrix)
isgpu
getlabel
getweights
setfeatureinfo!
36 changes: 36 additions & 0 deletions docs/src/features.md
@@ -177,3 +177,39 @@ Each of these merely returns a `NamedTuple` which can be used to supply keyword
xgboost(X, y, 1; countregression()..., randomforest()..., num_parallel_tree=12)
```
will fit a random forest according to a Poisson likelihood fit with 12 trees.
## GPU Support
XGBoost supports GPU-assisted training on Nvidia GPUs with CUDA via
[CUDA.jl](https://github.com/JuliaGPU/CUDA.jl). To utilize the GPU, one has to construct a
`DMatrix` object from GPU arrays. There are two ways of doing this:
- Pass a `CuArray` as the training matrix (conventionally `X`, the first argument to `DMatrix`).
- Pass a table with *all* columns as `CuVector`s.
You can check whether a `DMatrix` can use the GPU with [`XGBoost.isgpu`](@ref).
The target or label data does not need to be a `CuArray`.
It is not necessary to create an explicit `DMatrix` to use GPU features; data can be passed directly
to `xgboost` or `Booster` as usual, as long as it consists of `CuArray`s.
!!! note
    The `tree_method` parameter to `Booster` has special handling. If `nothing`, the default from
    `libxgboost` is used as per the documentation, unless a GPU array is given, in which case it
    defaults to `gpu_hist`. An explicitly set value overrides this.
### Example
```julia
X = cu(randn(1000, 3))
y = randn(1000)

dm = DMatrix(X, y)
XGBoost.isgpu(dm) # true

X = (x1=cu(randn(1000)), x2=cu(randn(1000)))
dm = DMatrix(X, y)
XGBoost.isgpu(dm) # true

xgboost((X, y), num_round=10) # no need to use `DMatrix`
```
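
As an illustrative aside (not part of the original docs), the automatic `gpu_hist` default only kicks in when `tree_method` is left unset; an explicitly passed value is always respected. A minimal sketch, assuming a working CUDA device:

```julia
using CUDA, XGBoost

X = cu(randn(Float32, 1000, 3))
y = randn(1000)

# with GPU data and no tree_method given, gpu_hist is selected automatically
bst_auto = xgboost((X, y), num_round=10)

# an explicitly passed tree_method is always respected, even with GPU data
bst_explicit = xgboost((X, y), num_round=10, tree_method="gpu_hist")
```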
6 changes: 6 additions & 0 deletions docs/src/index.md
@@ -137,6 +137,12 @@ Keyword arguments to `Booster` are xgboost model parameters. These are describe
they are described in the main xgboost documentation (in a few cases such as Greek letters we also
allow unicode equivalents).

!!! note

    The `tree_method` parameter has special handling. If `nothing`, the default from `libxgboost`
    is used as per the documentation, unless a GPU array is given as input, in which case it defaults
    to `gpu_hist`. An explicitly set value overrides this.

### Training
`Booster` objects can be trained with [`update!`](@ref).
```julia
1 change: 1 addition & 0 deletions src/XGBoost.jl
@@ -10,6 +10,7 @@ using OrderedCollections
using JSON3
using Tables
using Term
using CUDA
using Statistics: mean, std

using Base: @propagate_inbounds
12 changes: 11 additions & 1 deletion src/booster.jl
@@ -33,6 +33,9 @@ see [here](https://xgboost.readthedocs.io/en/stable/parameter.html) for a compre
Both parameter names and their values must be provided exactly as they appear in the linked
documentation. Model parameters can also be set after construction, see [`setparam!`](@ref) and
[`setparams!`](@ref).
- `tree_method`: This parameter gets special handling. By default it is `nothing`, which uses the
  default from `libxgboost` as per the documentation, unless GPU arrays are used, in which case it
  defaults to `"gpu_hist"`. An explicitly set option is always used.
- `feature_names`: Sets the feature names of training data. This will use the feature names set in the
input data if available (e.g. if tabular data was passed this will use column names).
- `model_buffer`: A buffer (`AbstractVector{UInt8}` or `IO`) from which to load an existing booster
@@ -89,6 +92,7 @@ function Booster(cache::AbstractVector{<:DMatrix};
feature_names::AbstractVector{<:AbstractString}=getfeaturenames(cache),
model_buffer=UInt8[],
model_file::AbstractString="",
tree_method::Union{Nothing,AbstractString}=nothing,
kw...
)
o = Ref{BoosterHandle}()
@@ -99,7 +103,13 @@ function Booster(cache::AbstractVector{<:DMatrix};
elseif !isempty(model_file)
load!(b, model_file)
end
setparams!(b; kw...)
# automatically use gpu_hist if CuArrays are used and no explicit tree_method argument was passed
tm = if isnothing(tree_method)
(!isempty(cache) && all(isgpu, cache)) ? (tree_method="gpu_hist",) : (;)
else
(tree_method=tree_method,)
end
setparams!(b; tm..., kw...)
b
end
Booster(dm::DMatrix; kw...) = Booster([dm]; kw...)
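
For readers unfamiliar with the splatting idiom in the constructor above, here is a small self-contained sketch (not package code; `configure` and `use_gpu` are made up for illustration) of how an empty versus single-entry `NamedTuple` injects the `tree_method` default only when it was not supplied:

```julia
# sketch of the NamedTuple-splat pattern: `(;)` splats to nothing at the call site,
# so the gpu_hist default is only injected when no explicit tree_method was given
function configure(; tree_method=nothing, use_gpu::Bool=false, kw...)
    tm = if isnothing(tree_method)
        use_gpu ? (tree_method="gpu_hist",) : (;)
    else
        (tree_method=tree_method,)
    end
    merge((; tm...), (; kw...))  # stand-in for setparams!(b; tm..., kw...)
end

configure(use_gpu=true)                      # (tree_method = "gpu_hist",)
configure(use_gpu=false)                     # NamedTuple()
configure(tree_method="hist", use_gpu=true)  # explicit value wins: (tree_method = "hist",)
```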
1 change: 1 addition & 0 deletions src/cuda.jl
@@ -0,0 +1 @@

76 changes: 66 additions & 10 deletions src/dmatrix.jl
@@ -79,11 +79,15 @@ mutable struct DMatrix <: AbstractMatrix{Union{Float32,Missing}}
# this is not allocated on initialization because it's not needed for any core functionality
data::Union{Nothing,SparseMatrixCSR{0,Float32,UInt64}}

# whether the DMatrix was initialized via GPU methods
is_gpu::Bool

function DMatrix(handle::Ptr{Nothing};
feature_names::AbstractVector{<:AbstractString}=String[],
is_gpu::Bool=false,
kw...
)
dm = new(handle, nothing)
dm = new(handle, nothing, is_gpu)
setinfos!(dm; kw...)
isempty(feature_names) || setfeaturenames!(dm, feature_names)
finalizer(x -> xgbcall(XGDMatrixFree, x.handle), dm)
@@ -100,7 +104,13 @@ function _setinfo!(dm::DMatrix, name::AbstractString, info::AbstractVector{<:Int
info
end

"""
isgpu(dm::DMatrix)
Whether or not the `DMatrix` data was initialized for a GPU. Boosters trained on such data utilize the GPU
for training.
"""
isgpu(dm::DMatrix) = dm.is_gpu

"""
setinfo!(dm::DMatrix, name, info)
@@ -174,6 +184,25 @@ function _dmatrix(x::AbstractMatrix{T}; missing_value::Float32=NaN32, kw...) whe
DMatrix(o[]; kw...)
end

# sadly we have to copy the CuArray because Julia's column-major layout doesn't match the expected row-major convention
function _transposed_cuda_dmatrix(x::CuArray{T}; missing_value::Float32=NaN32, kw...) where {T<:Real}
o = Ref{DMatrixHandle}()
cfg = "{\"missing\": $missing_value}"
GC.@preserve x begin
info = numpy_json_info(x)
xgbcall(XGDMatrixCreateFromCudaArrayInterface, info, cfg, o)
end
DMatrix(o[]; is_gpu=true, kw...)
end

DMatrix(x::Transpose{T,<:CuArray}; kw...) where {T<:Real} = _transposed_cuda_dmatrix(parent(x); kw...)
DMatrix(x::Adjoint{T,<:CuArray}; kw...) where {T<:Real} = _transposed_cuda_dmatrix(parent(x); kw...)

function DMatrix(x::CuArray; kw...)
x′ = CuArray(transpose(x))
_transposed_cuda_dmatrix(x′; kw...)
end
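
A hedged note on the layout handling above: Julia arrays are column-major while the CUDA array-interface path expects row-major data, so a lazily transposed `CuArray` can be passed through as-is, whereas a plain `CuArray` must be materialized as its transpose first. A sketch (shapes are arbitrary):

```julia
using CUDA, XGBoost

Xt = cu(randn(Float32, 3, 100))   # 3 features × 100 observations, column-major
dm_nocopy = DMatrix(Xt')          # adjoint wrapper: the parent array already has the right layout, no copy

X = cu(randn(Float32, 100, 3))    # 100 observations × 3 features
dm_copy = DMatrix(X)              # internally materializes CuArray(transpose(X)) before handing it to libxgboost
```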

function DMatrix(x::AbstractMatrix{T}; kw...) where {T<:Real}
# sadly, this copying is unavoidable
_dmatrix(convert(Matrix{Float32}, transpose(x)); kw...)
@@ -241,14 +270,35 @@ DMatrix(Xy::Tuple; kw...) = DMatrix(Xy[1], Xy[2]; kw...)

DMatrix(dm::DMatrix) = dm

function _check_gpu_table(tbl)
cols = Tables.Columns(tbl)
isgpu = all(x -> x isa CuArray, cols)
(isgpu, cols)
end

function _dmatrix_gpu_table(cols::Tables.Columns; missing_value::Float32=NaN32, kw...)
o = Ref{DMatrixHandle}()
cfg = "{\"missing\": $missing_value}"
GC.@preserve cols begin
infos = numpy_json_infos(cols)
xgbcall(XGDMatrixCreateFromCudaColumnar, infos, cfg, o)
end
DMatrix(o[]; is_gpu=true, kw...)
end

function DMatrix(tbl;
feature_names::AbstractVector{<:AbstractString}=collect(string.(Tables.columnnames(tbl))),
kw...
)
if !Tables.istable(tbl)
throw(ArgumentError("DMatrix requires either an AbstractMatrix or table satisfying the Tables.jl interface"))
end
DMatrix(Tables.matrix(tbl); feature_names, kw...)
(isgpu, cols) = _check_gpu_table(tbl)
if isgpu
_dmatrix_gpu_table(cols; feature_names, kw...)
else
DMatrix(Tables.matrix(tbl); feature_names, kw...)
end
end

DMatrix(tbl, y::AbstractVector; kw...) = DMatrix(tbl; label=y, kw...)
@@ -336,7 +386,7 @@ hasdata(dm::DMatrix) = !isnothing(dm.data)

@propagate_inbounds function Base.getindex(dm::DMatrix, idx...)
hasdata(dm) || getdata!(dm)
@inbounds getvalue(dm.data, idx..., missing)
@inbounds getvalue(dm.data, CartesianIndex(idx...), missing)
end

"""
@@ -435,15 +485,21 @@ _numpy_json_typestr(::Type{<:Complex{<:AbstractFloat}}) = "c"

numpy_json_typestr(::Type{T}) where {T<:Number} = string("<",_numpy_json_typestr(T),sizeof(T))

function numpy_json_info(x::AbstractMatrix; read_only::Bool=false)
info = Dict("data"=>(convert(Csize_t, pointer(x)), read_only),
"shape"=>reverse(size(x)),
"typestr"=>numpy_json_typestr(eltype(x)),
"version"=>3,
)
JSON3.write(info)
# pointer(x) should return the proper pointer even for CuArray
numpy_array_pointer(x::AbstractArray) = convert(Csize_t, pointer(x))

function numpy_json_dict(x::AbstractArray; read_only::Bool=false)
Dict("data"=>(numpy_array_pointer(x), read_only),
"shape"=>reverse(size(x)),
"typestr"=>numpy_json_typestr(eltype(x)),
"version"=>3,
)
end

numpy_json_info(x::AbstractArray; kw...) = JSON3.write(numpy_json_dict(x; kw...))

numpy_json_infos(cols::Tables.Columns; kw...) = JSON3.write(map(x -> numpy_json_dict(x; kw...), cols))
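
Purely for illustration (the pointer below is a dummy value, not anything the package produces), the dictionary built by `numpy_json_dict` serializes to the array-interface JSON consumed by the CUDA entry points of libxgboost; for a 100×3 `Float32` matrix it would look roughly like this:

```julia
using JSON3

ptr = UInt(0x7f0000000000)   # dummy device pointer, purely illustrative

info = Dict(
    "data"    => (ptr, false),   # (device pointer, read_only flag)
    "shape"   => (100, 3),       # reverse of Julia's size(x), i.e. the row-major shape
    "typestr" => "<f4",          # little-endian 4-byte float (Float32)
    "version" => 3,
)

JSON3.write(info)  # JSON like {"shape":[100,3],"typestr":"<f4",...} (key order of a Dict is unspecified)
```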

#TODO: still a little worried about ownership here
#TODO: sparse data for iterator and proper missings handling

6 changes: 4 additions & 2 deletions src/show.jl
@@ -21,12 +21,14 @@ function Base.show(io::IO, mime::MIME"text/plain", dm::DMatrix)
context=:compact=>true,
)
end
subtitle = "(nrows=$(nrows(dm)), ncols=$(ncols(dm)))"
isgpu(dm) && (subtitle *= " {bold green}(GPU){/bold green}")
p = Panel(_features_display_string(getfeaturenames(dm), size(dm,2)),
str,
str;
style="magenta",
title="XGBoost.DMatrix",
title_style="bold cyan",
subtitle="(nrows=$(nrows(dm)), ncols=$(ncols(dm)))",
subtitle,
subtitle_style="blue",
)
show(io, mime, p)
24 changes: 23 additions & 1 deletion test/runtests.jl
@@ -1,4 +1,5 @@
using XGBoost
using CUDA: has_cuda, cu
using Random, SparseArrays
using Test

@@ -164,7 +165,7 @@ end
dtrain = XGBoost.load(DMatrix, testfilepath("agaricus.txt.train"))
dtest = XGBoost.load(DMatrix, testfilepath("agaricus.txt.test"))

model_file, _ = mktemp()
(model_file, _) = mktemp()

bst = xgboost(dtrain, num_round=5,
η=1.0, max_depth=2,
@@ -199,5 +200,26 @@ end
@test preds == predict(bst2, dtest)
end

has_cuda() && @testset "cuda" begin
X = randn(Float32, 4, 5)
dm = DMatrix(cu(X))
@test size(dm) == size(X)
@test XGBoost.isgpu(dm)
@test dm == Matrix(X)

X = randn(Float32, 4, 5)
dm = DMatrix(cu(X)')
@test size(dm) == size(X')
@test XGBoost.isgpu(dm)
@test dm == Matrix(X')

X₀ = randn(Float32, 100, 3)
X = (x1=cu(X₀[:,1]), x2=cu(X₀[:,2]), x3=cu(X₀[:,3]))
dm = DMatrix(X)
@test size(dm) == size(X₀)
@test XGBoost.isgpu(dm)
@test dm == X₀
end


end
