forked from tensorflow/tensorflow
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathkernel.cc
More file actions
95 lines (75 loc) · 2.97 KB
/
kernel.cc
File metadata and controls
95 lines (75 loc) · 2.97 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
// Implementation of the pointer-to-implementation wrapper for the data-parallel
// kernel abstraction. KernelBase just delegates to the internal
// platform-specific implementation instance.
#include "tensorflow/stream_executor/kernel.h"
#include "tensorflow/stream_executor/platform/port.h"
#include "tensorflow/stream_executor/lib/demangle.h"
#include "tensorflow/stream_executor/platform.h"
#include "tensorflow/stream_executor/platform/logging.h"
#include "tensorflow/stream_executor/stream_executor.h"
#include "tensorflow/stream_executor/stream_executor_internal.h"
namespace perftools {
namespace gputools {
// Retrieves the per-thread register count into |registers_per_thread|.
// Returns false (leaving the output untouched) when no value was recorded.
bool KernelMetadata::registers_per_thread(int *registers_per_thread) const {
  if (!has_registers_per_thread_) {
    return false;
  }
  *registers_per_thread = registers_per_thread_;
  return true;
}
// Records the kernel's per-thread register usage and marks it as available
// so that registers_per_thread() queries succeed.
void KernelMetadata::set_registers_per_thread(int registers_per_thread) {
  has_registers_per_thread_ = true;
  registers_per_thread_ = registers_per_thread;
}
// Retrieves the kernel's shared-memory usage (in bytes) into
// |shared_memory_bytes|. Returns false (leaving the output untouched) when
// no value was recorded.
bool KernelMetadata::shared_memory_bytes(int *shared_memory_bytes) const {
  if (!has_shared_memory_bytes_) {
    return false;
  }
  *shared_memory_bytes = shared_memory_bytes_;
  return true;
}
// Records the kernel's shared-memory usage (in bytes) and marks it as
// available so that shared_memory_bytes() queries succeed.
void KernelMetadata::set_shared_memory_bytes(int shared_memory_bytes) {
  has_shared_memory_bytes_ = true;
  shared_memory_bytes_ = shared_memory_bytes;
}
// Creates the platform-specific kernel-implementation object for
// |platform_kind|. Aborts the process (LOG(FATAL)) for platform kinds that
// have no kernel implementation.
static internal::KernelInterface *KernelImplementationFromPlatformKind(
    PlatformKind platform_kind) {
  if (platform_kind == PlatformKind::kCuda) {
    return (*internal::MakeCUDAKernelImplementation())();
  }
  if (platform_kind == PlatformKind::kOpenCL ||
      platform_kind == PlatformKind::kOpenCLAltera) {
    return (*internal::MakeOpenCLKernelImplementation())();
  }
  LOG(FATAL) << "cannot create kernel implementation for platform kind: "
             << PlatformKindString(platform_kind);
  // Unreachable: LOG(FATAL) aborts. The return keeps the compiler from
  // warning that control can reach the end of a non-void function when the
  // logging macro is not annotated as noreturn.
  return nullptr;
}
// Constructs a KernelBase whose implementation is selected from |parent|'s
// platform kind.
// NOTE(review): |parent| is dereferenced in the initializer list (for
// platform_kind()) before the DCHECK below ever runs, so a null |parent|
// crashes before the check fires — consider validating earlier.
KernelBase::KernelBase(StreamExecutor *parent)
: implementation_(
KernelImplementationFromPlatformKind(parent->platform_kind())),
parent_(parent) {
DCHECK(parent_ != nullptr);
}
// Constructs a KernelBase that wraps an externally-supplied platform
// implementation instead of deriving one from |parent|'s platform kind.
KernelBase::KernelBase(StreamExecutor *parent,
                       internal::KernelInterface *implementation)
    : implementation_(implementation), parent_(parent) {}
KernelBase::~KernelBase() {}
unsigned KernelBase::Arity() const { return implementation_->Arity(); }
// Forwards the preferred cache configuration to the platform-specific
// implementation.
void KernelBase::SetPreferredCacheConfig(KernelCacheConfig config) {
  implementation_->SetPreferredCacheConfig(config);
}
// Queries the platform-specific implementation for the currently preferred
// cache configuration.
KernelCacheConfig KernelBase::GetPreferredCacheConfig() const {
  return implementation_->GetPreferredCacheConfig();
}
// Prefix stub functions emitted by the CUDA splitter.
static const char *kStubPrefix = "__device_stub_";

// Stores the kernel's name and a demangled form of it. If the name carries
// the CUDA splitter's stub prefix, the prefix is stripped before demangling.
void KernelBase::set_name(port::StringPiece name) {
  name_ = name.ToString();

  port::StringPiece stubless_name = name;
  if (name.starts_with(kStubPrefix)) {
    stubless_name.remove_prefix(strlen(kStubPrefix));
  }
  // Materialize a std::string before the C-string call: StringPiece does not
  // guarantee that data() is NUL-terminated, so passing data() directly to
  // Demangle could read past the piece's end.
  demangled_name_ = port::Demangle(stubless_name.ToString().c_str());
}
} // namespace gputools
} // namespace perftools