| field | value | timestamp |
|---|---|---|
| author | CoprDistGit <infra@openeuler.org> | 2025-11-11 12:34:01 +0000 |
| committer | CoprDistGit <infra@openeuler.org> | 2025-11-11 12:34:01 +0000 |
| commit | 2b940cc82a7c6490b3ca84d2981ba9afaac7d4ea | |
| tree | c612dcc6e6a0ea24adf0ff4bcdd0be97d7d9a53f | |
| parent | d93a9b4610b60112b64caa0f699b7e8a5ca370f9 | |
automatic import of ANNC
| mode | file | lines |
|---|---|---|
| -rw-r--r-- | .gitignore | 5 |
| -rw-r--r-- | 0001-fix-pattern-conflicts.patch | 72 |
| -rw-r--r-- | 0002-Add-graph-optimizer-and-embedding_fused-kernels.patch | 4297 |
| -rw-r--r-- | ANNC.spec | 83 |
| -rw-r--r-- | sources | 9 |
| -rw-r--r-- | x86_64_external_files.patch | 132 |
6 files changed, 4511 insertions, 87 deletions
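For orientation before the raw patches: a minimal, hypothetical driver (not part of this import) sketching how the graph optimizer added by 0002-Add-graph-optimizer-and-embedding_fused-kernels.patch is switched on. It is grounded only in what the patch itself defines — the entry point `annc::run_graph_optimization(GraphDef*)`, the `ANNC_FUSED_*` environment variables that function reads, and the `/kp_fused` suffix given to fused nodes; the surrounding TensorFlow loading/session plumbing is assumed and elided.

```cpp
// Hypothetical driver, not part of this import. Assumes a TensorFlow build
// that provides GraphDef and the graph_opt.h header added by patch 0002.
#include <cstdlib>

#include "annc/tensorflow/graph_optimizer/graph_opt.h"

int main() {
  // ANNC_FUSED_ALL=1 registers every pattern rewriter except the two that
  // must be enabled individually (ANNC_FUSED_EMB_ACTIONID_GATHER and
  // ANNC_FUSED_SPS_REDUCE_NONZERO); the other ANNC_FUSED_* variables
  // toggle single rewriters.
  setenv("ANNC_FUSED_ALL", "1", /*overwrite=*/1);

  tensorflow::GraphDef graph;
  // ... populate `graph` with a frozen GraphDef here (elided) ...

  // Runs each registered PatternRewriter over the graph; matched subgraphs
  // are replaced by fused nodes whose names carry the "/kp_fused" suffix.
  annc::run_graph_optimization(&graph);
  return 0;
}
```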
@@ -2,3 +2,8 @@ /annc_external.tar.gz.aa /annc_external.tar.gz.ab /ANNC-v0.0.2.tar.gz +/XNNPACK.tar.gz +/external.tar.gz.aa +/external.tar.gz.ab +/external.tar.gz.ac +/v3.2.tar.gz diff --git a/0001-fix-pattern-conflicts.patch b/0001-fix-pattern-conflicts.patch new file mode 100644 index 0000000..461a7e6 --- /dev/null +++ b/0001-fix-pattern-conflicts.patch @@ -0,0 +1,72 @@ +From d941fd6d6893abd1893d2da25afa7847ec32f132 Mon Sep 17 00:00:00 2001 +From: zhengchenhui <zhengchenhui1@huawei.com> +Date: Tue, 9 Sep 2025 15:27:43 +0800 +Subject: [PATCH 1/2] fix pattern conflicts. + +--- + python/annc/optimize/graph.py | 6 ++++++ + python/annc/optimize/rewriter.py | 17 ++++++++++------- + python/setup.py | 1 - + 3 files changed, 16 insertions(+), 8 deletions(-) + +diff --git a/python/annc/optimize/graph.py b/python/annc/optimize/graph.py +index dae7d6d..1a97d1b 100644 +--- a/python/annc/optimize/graph.py ++++ b/python/annc/optimize/graph.py +@@ -255,6 +255,12 @@ class Graph: + + self.versions['producer'] = graph_def.versions.producer + ++ def check_node_exist(self, node: Node) -> bool: ++ for i, n in enumerate(self.nodes): ++ if n.name == node.name: ++ return True ++ return False ++ + def get_node(self, name: str) -> Node: + for node in self.nodes: + if node.name == name: +diff --git a/python/annc/optimize/rewriter.py b/python/annc/optimize/rewriter.py +index 2e66189..c0f2894 100644 +--- a/python/annc/optimize/rewriter.py ++++ b/python/annc/optimize/rewriter.py +@@ -46,15 +46,18 @@ class BaseRewriter(ABC): + :param users: expected list of user types and names + :param expected_num_users: expected number of users (optional) + """ +- if user_num is not None and len(node.users) != user_num: +- raise CheckFailed +- if len(users) > len(node.users): +- raise CheckFailed +- for i, (type, name) in enumerate(users): +- if type is not None and node.users[i].type != type: ++ if self.graph.check_node_exist(node): ++ if user_num is not None and len(node.users) != user_num: + raise CheckFailed +- if name is not None and node.users[i].name != name: ++ if len(users) > len(node.users): + raise CheckFailed ++ for i, (type, name) in enumerate(users): ++ if type is not None and node.users[i].type != type: ++ raise CheckFailed ++ if name is not None and node.users[i].name != name: ++ raise CheckFailed ++ else: ++ raise CheckFailed + + def check_operands(self, + node: Node, +diff --git a/python/setup.py b/python/setup.py +index 1197745..d32026e 100644 +--- a/python/setup.py ++++ b/python/setup.py +@@ -34,6 +34,5 @@ setup(name=project, + entry_points={ + 'console_scripts': [ + f'{project}-opt = {project}.main:opt', +- f'{project}-apply-tf = scripts.install:tf_install', + ], + }) +-- +2.33.0 + diff --git a/0002-Add-graph-optimizer-and-embedding_fused-kernels.patch b/0002-Add-graph-optimizer-and-embedding_fused-kernels.patch new file mode 100644 index 0000000..4fd5abd --- /dev/null +++ b/0002-Add-graph-optimizer-and-embedding_fused-kernels.patch @@ -0,0 +1,4297 @@ +From 90ac46231be919e1c07b7d41bc0a8c4b1f1ba41a Mon Sep 17 00:00:00 2001 +From: zhengchenhui <zhengchenhui1@huawei.com> +Date: Mon, 10 Nov 2025 16:49:10 +0800 +Subject: [PATCH] Add graph optimizer and embedding_fused kernels. 
+ +--- + annc/tensorflow/graph_optimizer/BUILD | 32 + + annc/tensorflow/graph_optimizer/graph_opt.cc | 738 ++++++++++++++++++ + annc/tensorflow/graph_optimizer/graph_opt.h | 165 ++++ + .../embedding_fused_action_id_gather.cc | 144 ++++ + .../embedding_fused_action_id_gather_test.cc | 289 +++++++ + .../kernels/embedding_fused_gather.cc | 90 +++ + .../kernels/embedding_fused_gather_test.cc | 186 +++++ + .../kernels/embedding_fused_padding.cc | 126 +++ + .../kernels/embedding_fused_padding_test.cc | 307 ++++++++ + .../embedding_fused_sparse_dynamic_stitch.cc | 87 +++ + ...edding_fused_sparse_dynamic_stitch_test.cc | 108 +++ + .../kernels/embedding_fused_sparse_reshape.cc | 195 +++++ + .../embedding_fused_sparse_reshape_test.cc | 281 +++++++ + .../embedding_fused_sparse_segment_reduce.cc | 165 ++++ + ...ing_fused_sparse_segment_reduce_nonzero.cc | 159 ++++ + ...used_sparse_segment_reduce_nonzero_test.cc | 183 +++++ + ...edding_fused_sparse_segment_reduce_test.cc | 205 +++++ + .../kernels/embedding_fused_sparse_select.cc | 111 +++ + .../embedding_fused_sparse_select_test.cc | 182 +++++ + annc/tensorflow/ops/embedding_fused_ops.cc | 133 ++++ + annc/tensorflow/tf_annc_optimizer.patch | 221 ++++++ + 21 files changed, 4107 insertions(+) + create mode 100644 annc/tensorflow/graph_optimizer/BUILD + create mode 100644 annc/tensorflow/graph_optimizer/graph_opt.cc + create mode 100644 annc/tensorflow/graph_optimizer/graph_opt.h + create mode 100644 annc/tensorflow/kernels/embedding_fused_action_id_gather.cc + create mode 100644 annc/tensorflow/kernels/embedding_fused_action_id_gather_test.cc + create mode 100644 annc/tensorflow/kernels/embedding_fused_gather.cc + create mode 100644 annc/tensorflow/kernels/embedding_fused_gather_test.cc + create mode 100644 annc/tensorflow/kernels/embedding_fused_padding.cc + create mode 100644 annc/tensorflow/kernels/embedding_fused_padding_test.cc + create mode 100644 annc/tensorflow/kernels/embedding_fused_sparse_dynamic_stitch.cc + create mode 100644 annc/tensorflow/kernels/embedding_fused_sparse_dynamic_stitch_test.cc + create mode 100644 annc/tensorflow/kernels/embedding_fused_sparse_reshape.cc + create mode 100644 annc/tensorflow/kernels/embedding_fused_sparse_reshape_test.cc + create mode 100644 annc/tensorflow/kernels/embedding_fused_sparse_segment_reduce.cc + create mode 100644 annc/tensorflow/kernels/embedding_fused_sparse_segment_reduce_nonzero.cc + create mode 100644 annc/tensorflow/kernels/embedding_fused_sparse_segment_reduce_nonzero_test.cc + create mode 100644 annc/tensorflow/kernels/embedding_fused_sparse_segment_reduce_test.cc + create mode 100644 annc/tensorflow/kernels/embedding_fused_sparse_select.cc + create mode 100644 annc/tensorflow/kernels/embedding_fused_sparse_select_test.cc + create mode 100644 annc/tensorflow/ops/embedding_fused_ops.cc + create mode 100644 annc/tensorflow/tf_annc_optimizer.patch + +diff --git a/annc/tensorflow/graph_optimizer/BUILD b/annc/tensorflow/graph_optimizer/BUILD +new file mode 100644 +index 0000000..39a37f8 +--- /dev/null ++++ b/annc/tensorflow/graph_optimizer/BUILD +@@ -0,0 +1,32 @@ ++package( ++ default_visibility = [ ++ "//visibility:public", ++ ], ++ licenses = ["notice"], ++) ++ ++cc_library( ++ name = "annc_graph_opt", ++ srcs = glob(["*.cc"]), ++ hdrs = glob(["*.h"]), ++ linkstatic = True, ++ alwayslink = True, ++ visibility = ["//visibility:public"], ++ deps = [ ++ "//tensorflow/core/grappler:graph_view", ++ "//tensorflow/core/grappler:grappler_item", ++ "//tensorflow/core/grappler:op_types", ++ ], ++) ++ 
++cc_binary( ++ name = "libannc_graph_opt.so", ++ srcs = glob(["*.cc", "*.h"]), ++ linkshared = True, ++ visibility = ["//visibility:public"], ++ deps = [ ++ "//tensorflow/core/grappler:graph_view", ++ "//tensorflow/core/grappler:grappler_item", ++ "//tensorflow/core/grappler:op_types", ++ ], ++) +\ No newline at end of file +diff --git a/annc/tensorflow/graph_optimizer/graph_opt.cc b/annc/tensorflow/graph_optimizer/graph_opt.cc +new file mode 100644 +index 0000000..9c74489 +--- /dev/null ++++ b/annc/tensorflow/graph_optimizer/graph_opt.cc +@@ -0,0 +1,738 @@ ++#include "graph_opt.h" ++ ++using namespace tensorflow; ++using namespace tensorflow::grappler; ++ ++namespace annc { ++void update_node_indexes(const GraphDef* graph, ++ std::unordered_map<std::string, int>& node_indexes) { ++ for (int i = 0; i < graph->node_size(); ++i) { ++ node_indexes[graph->node(i).name()] = i; ++ } ++} ++ ++void GraphOptimizer::register_rewriter( ++ std::unique_ptr<PatternRewriter> rewriter) { ++ rewriters_.push_back(std::move(rewriter)); ++} ++ ++void GraphOptimizer::optimize() { ++ update_node_indexes(graph_, node_indexes_); ++ int node_index = 0; ++ const int node_size = graph_->node_size(); ++ while (node_index < node_size) { ++ const NodeDef& node = graph_->node(node_index); ++ const std::string& node_name = node.name(); ++ for (auto& rewriter : rewriters_) { ++ if (rewriter->match_and_rewrite(&node, graph_, node_indexes_)) { ++ update_node_indexes(graph_, node_indexes_); ++ const std::string new_node_name = node_name + fusion_appendix; ++ node_index = node_indexes_.at(new_node_name); ++ break; ++ } ++ } ++ node_index++; ++ } ++} ++ ++std::string get_node_name(const std::string& name) { ++ size_t colon_pos = name.find_last_of(':'); ++ std::string node_name = name; ++ if (colon_pos != std::string::npos) { ++ node_name = name.substr(0, colon_pos); ++ } ++ return node_name; ++} ++ ++void set_fusedop_attributes(NodeDef* fused, ++ const absl::Span<const absl::string_view> fused_ops, ++ int num_args = 1, float epsilon = 0.0) { ++ auto* attr = fused->mutable_attr(); ++ SetAttrValue(fused_ops, &(*attr)["fused_ops"]); ++ SetAttrValue(num_args, &(*attr)["num_args"]); ++ SetAttrValue(epsilon, &(*attr)["epsilon"]); // required only for BatchNorm ++} ++ ++const NodeDef* PatternRewriter::get_node(const std::string& name) { ++ const std::string node_name = get_node_name(name); ++ const int node_index = indexes_->at(node_name); ++ return &graph_->node(node_index); ++} ++ ++NodeDef* PatternRewriter::get_mutable_node(const std::string& name) { ++ const std::string node_name = get_node_name(name); ++ if (indexes_->find(node_name) == indexes_->end()) return nullptr; ++ const int node_index = indexes_->at(node_name); ++ return graph_->mutable_node(node_index); ++} ++ ++NodeDef* PatternRewriter::get_operand(const NodeDef* node, ++ std::string op_type) { ++ for (int i = 0; i < node->input_size(); ++i) { ++ NodeDef* operand = get_mutable_node(node->input(i)); ++ if (operand != nullptr && operand->op() == op_type) return operand; ++ } ++ return nullptr; ++} ++ ++const NodeDef* PatternRewriter::get_user(const NodeDef* node, int index, ++ const std::string& op_type) { ++ std::string node_name = node->name(); ++ if (index) std::string node_name = node_name + ":" + std::to_string(index); ++ for (int i = 0; i < graph_->node_size(); ++i) { ++ const NodeDef* node = graph_->mutable_node(i); ++ for (int j = 0; j < node->input_size(); ++j) { ++ if (node->input(j) == node_name && node->op() == op_type) { ++ return node; ++ } ++ } ++ } ++ 
return nullptr; ++} ++ ++void PatternRewriter::replace_all_users_with(const NodeDef* old_node, ++ int old_index, ++ const NodeDef* new_node, ++ int new_index, GraphDef* graph) { ++ std::string old_name = old_node->name(); ++ if (old_index) old_name = old_name + ":" + std::to_string(old_index); ++ std::string new_name = new_node->name(); ++ if (new_index) new_name = new_name + ":" + std::to_string(new_index); ++ for (int i = 0; i < graph->node_size(); ++i) { ++ NodeDef* node = graph->mutable_node(i); ++ for (int j = 0; j < node->input_size(); ++j) { ++ if (node->input(j) == old_name) { ++ node->set_input(j, new_name); ++ } ++ } ++ } ++} ++ ++class KPFusedSparseDynamicStitchRewriter : public PatternRewriter { ++ public: ++ std::string name() const override { return "KPFusedSparseDynamicStitch"; } ++ ++ bool match_and_rewrite( ++ const NodeDef* node, GraphDef* graph, ++ std::unordered_map<std::string, int>& node_indexes) override { ++ graph_ = graph; ++ indexes_ = &node_indexes; ++ CHECK_NODE_OK(node->op() == "ParallelDynamicStitch" && ++ node->input_size() % 2 == 0) ++ int num_inputs = node->input_size(); ++ int num_partitions = num_inputs / 2; ++ // left branch ++ const NodeDef* partition = get_node(node->input(0)); ++ CHECK_NODE_OK(partition->op() == "DynamicPartition" && ++ partition->input_size() == 2) ++ const NodeDef* range = get_node(partition->input(0)); ++ CHECK_NODE_OK(range->op() == "Range" && range->input_size() == 3) ++ // Range start=0, delta=1 ++ CHECK_NODE_OK( ++ check_const_value<int>(get_mutable_node(range->input(0)), {0})) ++ CHECK_NODE_OK( ++ check_const_value<int>(get_mutable_node(range->input(2)), {1})) ++ const NodeDef* size = get_node(range->input(1)); ++ CHECK_NODE_OK(IsSize(*size) && size->input_size() == 1) ++ const NodeDef* cast = get_node(partition->input(1)); ++ CHECK_NODE_OK(IsCast(*cast) && cast->input_size() == 1) ++ const NodeDef* floor_mod = get_node(cast->input(0)); ++ CHECK_NODE_OK(floor_mod->op() == "FloorMod" && floor_mod->input_size() == 2) ++ CHECK_NODE_OK(check_const_value<int64_t>( ++ get_mutable_node(floor_mod->input(1)), {num_partitions})) ++ ++ CHECK_NODE_OK(check_int_attr(node, "N", {num_partitions})) ++ CHECK_NODE_OK(check_int_attr(partition, "num_partitions", {num_partitions})) ++ ++ auto nodes = graph->mutable_node(); ++ NodeDef* fused_node = nodes->Add(); ++ fused_node->set_name(node->name() + fusion_appendix); ++ fused_node->set_op(name()); ++ fused_node->set_device(node->device()); ++ fused_node->add_input(size->input(0)); ++ // right branch ++ for (int i = num_partitions; i < num_inputs; ++i) { ++ const NodeDef* gather = get_node(node->input(i)); ++ CHECK_NODE_OK(gather->op() == "GatherV2" && gather->input_size() == 3) ++ // Gather axis=0 ++ CHECK_NODE_OK( ++ check_const_value<int>(get_mutable_node(gather->input(2)), {0})) ++ const NodeDef* partition_1 = get_node(gather->input(1)); ++ CHECK_NODE_OK(partition_1->op() == "DynamicPartition" && ++ partition_1->input_size() == 2) ++ CHECK_NODE_OK( ++ check_int_attr(partition_1, "num_partitions", {num_partitions})) ++ const NodeDef* floor_div = get_node(partition_1->input(0)); ++ CHECK_NODE_OK(floor_div->op() == "FloorDiv" && ++ floor_div->input_size() == 2) ++ CHECK_NODE_OK(check_const_value<int64_t>( ++ get_mutable_node(floor_div->input(1)), {num_partitions})) ++ fused_node->add_input(gather->input(0)); ++ } ++ (*fused_node->mutable_attr())["N"].set_i(num_partitions); ++ nodes->SwapElements(node_indexes.at(node->name()), nodes->size() - 1); ++ ++ VLOG(0) << "-- Add node: [" << 
fused_node->op() << "] " ++ << fused_node->name(); ++ replace_all_users_with(node, 0, fused_node, 0, graph); ++ return true; ++ } ++}; ++ ++class KPFusedSparseSegmentReduceRewriter : public PatternRewriter { ++ public: ++ std::string name() const override { return "KPFusedSparseSegmentReduce"; } ++ ++ bool match_and_rewrite( ++ const NodeDef* node, GraphDef* graph, ++ std::unordered_map<std::string, int>& node_indexes) override { ++ graph_ = graph; ++ indexes_ = &node_indexes; ++ CHECK_NODE_OK(IsStridedSlice(*node) && node->input_size() == 4) ++ const NodeDef* shape = get_node(node->input(0)); ++ CHECK_NODE_OK(IsShape(*shape) && shape->input_size() == 1) ++ const NodeDef* ss_reduce = get_node(shape->input(0)); ++ CHECK_NODE_OK(ss_reduce->input_size() == 3) ++ AttrValue combiner; ++ if (ss_reduce->op() == "SparseSegmentMean") ++ combiner.set_i(1); ++ else if (ss_reduce->op() == "SparseSegmentSum") ++ combiner.set_i(0); ++ else ++ return false; ++ const NodeDef* cast = get_node(ss_reduce->input(2)); ++ CHECK_NODE_OK(IsCast(*cast) && cast->input_size() == 1) ++ const NodeDef* strided_slice = get_node(cast->input(0)); ++ CHECK_NODE_OK(IsStridedSlice(*strided_slice) && ++ strided_slice->input_size() == 4) ++ ++ // check fusion conditions ++ CHECK_NODE_OK( ++ check_const_shape(get_mutable_node(strided_slice->input(1)), {2})) ++ CHECK_NODE_OK(check_int_attr(strided_slice, "shrink_axis_mask", 2)) ++ CHECK_NODE_OK(check_int_attr(strided_slice, "begin_mask", 1)) ++ CHECK_NODE_OK(check_int_attr(strided_slice, "end_mask", 1)) ++ CHECK_NODE_OK(check_const_shape(get_mutable_node(node->input(1)), {1})) ++ CHECK_NODE_OK(check_int_attr(node, "shrink_axis_mask", 1)) ++ ++ auto nodes = graph->mutable_node(); ++ NodeDef* fused_node = nodes->Add(); ++ fused_node->set_name(node->name() + fusion_appendix); ++ fused_node->set_op(name()); ++ fused_node->set_device(node->device()); ++ fused_node->add_input(ss_reduce->input(0)); ++ fused_node->add_input(ss_reduce->input(1)); ++ fused_node->add_input(strided_slice->input(0)); ++ fused_node->add_input(strided_slice->input(1)); ++ fused_node->add_input(node->input(1)); ++ AddNodeAttr("combiner", combiner, fused_node); ++ ++ nodes->SwapElements(node_indexes.at(node->name()), nodes->size() - 1); ++ ++ VLOG(0) << "-- Add node: [" << fused_node->op() << "] " ++ << fused_node->name(); ++ replace_all_users_with(ss_reduce, 0, fused_node, 0, graph); ++ replace_all_users_with(node, 0, fused_node, 1, graph); ++ return true; ++ } ++}; ++ ++class KPFusedSparseSegmentReduceNonzeroRewriter : public PatternRewriter { ++ public: ++ std::string name() const override { ++ return "KPFusedSparseSegmentReduceNonzero"; ++ } ++ ++ bool match_and_rewrite( ++ const NodeDef* node, GraphDef* graph, ++ std::unordered_map<std::string, int>& node_indexes) override { ++ graph_ = graph; ++ indexes_ = &node_indexes; ++ CHECK_NODE_OK(node->op() == "GatherND" && ++ node->input_size() == 2) // output:2 ++ const NodeDef* ss_reduce = get_node(node->input(0)); ++ CHECK_NODE_OK(ss_reduce->input_size() == 3) ++ AttrValue combiner; ++ if (ss_reduce->op() == "SparseSegmentMean") { ++ combiner.set_i(1); ++ } else if (ss_reduce->op() == "SparseSegmentSum") { ++ combiner.set_i(0); ++ } else { ++ return false; ++ } ++ const NodeDef* where = get_node(node->input(1)); ++ CHECK_NODE_OK(where->op() == "Where" && where->input_size() == 1) ++ const NodeDef* cast = get_user(where, 0, "Cast"); ++ CHECK_NODE_OK(cast != nullptr) // output: 1 ++ const NodeDef* notequal = get_node(where->input(0)); ++ 
CHECK_NODE_OK(IsNotEqual(*notequal) && notequal->input_size() == 2); ++ const NodeDef* zerolike = get_node(notequal->input(1)); ++ CHECK_NODE_OK(IsZerosLike(*zerolike) && zerolike->input_size() == 1) ++ const NodeDef* cast_1 = get_node(ss_reduce->input(2)); ++ CHECK_NODE_OK(IsCast(*cast_1) && cast_1->input_size() == 1) ++ const NodeDef* strided_slice = get_node(cast->input(0)); ++ CHECK_NODE_OK(IsStridedSlice(*strided_slice) && ++ strided_slice->input_size() == 4) ++ const NodeDef* shape = get_user(ss_reduce, 0, "Shape"); ++ CHECK_NODE_OK(shape != nullptr) ++ const NodeDef* cast_2 = get_user(shape, 0, "Cast"); // output: 0 ++ CHECK_NODE_OK(cast_2 != nullptr) ++ ++ CHECK_NODE_OK( ++ check_const_shape(get_mutable_node(strided_slice->input(1)), {2})) ++ CHECK_NODE_OK(check_int_attr(strided_slice, "shrink_axis_mask", 2)) ++ CHECK_NODE_OK(check_int_attr(strided_slice, "begin_mask", 1)) ++ CHECK_NODE_OK(check_int_attr(strided_slice, "end_mask", 1)) ++ ++ auto nodes = graph->mutable_node(); ++ NodeDef* fused_node = nodes->Add(); ++ fused_node->set_name(node->name() + fusion_appendix); ++ fused_node->set_op(name()); ++ fused_node->set_device(node->device()); ++ fused_node->add_input(ss_reduce->input(0)); ++ fused_node->add_input(ss_reduce->input(1)); ++ fused_node->add_input(strided_slice->input(0)); ++ fused_node->add_input(strided_slice->input(1)); ++ AddNodeAttr("combiner", combiner, fused_node); ++ ++ nodes->SwapElements(node_indexes.at(node->name()), nodes->size() - 1); ++ ++ VLOG(0) << "-- Add node: [" << fused_node->op() << "] " ++ << fused_node->name() << "\n"; ++ replace_all_users_with(cast_2, 0, fused_node, 0, graph); ++ replace_all_users_with(cast, 0, fused_node, 1, graph); ++ replace_all_users_with(node, 0, fused_node, 2, graph); ++ return true; ++ } ++}; ++ ++class KPFusedEmbeddingPaddingRewriter : public PatternRewriter { ++ public: ++ std::string name() const override { return "KPFusedEmbeddingPadding"; } ++ ++ bool match_and_rewrite( ++ const NodeDef* node, GraphDef* graph, ++ std::unordered_map<std::string, int>& node_indexes) override { ++ graph_ = graph; ++ indexes_ = &node_indexes; ++ CHECK_NODE_OK(IsReshape(*node)) ++ const NodeDef* user = get_user(node, 0, "ConcatV2"); ++ CHECK_NODE_OK(user != nullptr) ++ const NodeDef* concat = get_node(node->input(0)); ++ CHECK_NODE_OK(IsConcat(*concat) && concat->input_size() == 3) ++ CHECK_NODE_OK( ++ check_const_value<int>(get_mutable_node(concat->input(2)), {0})) ++ const NodeDef* fill = get_node(concat->input(1)); ++ CHECK_NODE_OK(IsFill(*fill) && fill->input_size() == 2) ++ NodeDef* pack = get_operand(fill, "Pack"); ++ CHECK_NODE_OK(pack != nullptr && IsPack(*pack) && pack->input_size() == 2) ++ NodeDef* fill_const = get_operand(fill, "Const"); ++ CHECK_NODE_OK(fill_const != nullptr && ++ check_const_value<int>(fill_const, {0})) ++ NodeDef* sub = get_operand(pack, "Sub"); ++ CHECK_NODE_OK(sub != nullptr && IsSub(*sub) && sub->input_size() == 2) ++ const NodeDef* strided_slice = get_node(sub->input(0)); ++ CHECK_NODE_OK(IsStridedSlice(*strided_slice) && ++ strided_slice->input_size() == 4) ++ const NodeDef* cast = get_node(strided_slice->input(0)); ++ CHECK_NODE_OK(IsCast(*cast)) ++ ++ auto nodes = graph->mutable_node(); ++ NodeDef* fused_node = nodes->Add(); ++ fused_node->set_name(node->name() + fusion_appendix); ++ fused_node->set_op(name()); ++ fused_node->set_device(node->device()); ++ fused_node->add_input(cast->input(0)); ++ fused_node->add_input(concat->input(0)); ++ fused_node->add_input(sub->input(1)); ++ 
fused_node->add_input(node->input(1)); ++ const NodeDef* pack_left = get_node(pack->input(0)); ++ const NodeDef* pack_right = get_node(pack->input(1)); ++ if (IsConstant(*pack_left) || IsHostConstant(*pack_left)) { ++ fused_node->add_input(pack->input(0)); ++ } else if (IsConstant(*pack_right) || IsHostConstant(*pack_right)) { ++ fused_node->add_input(pack->input(1)); ++ } else { ++ return false; ++ } ++ ++ nodes->SwapElements(node_indexes.at(node->name()), nodes->size() - 1); ++ ++ VLOG(0) << "-- Add node: [" << fused_node->op() << "] " ++ << fused_node->name(); ++ replace_all_users_with(sub, 0, fused_node, 0, graph); ++ replace_all_users_with(node, 0, fused_node, 1, graph); ++ return true; ++ } ++}; ++ ++class KPFusedEmbeddingPaddingFastRewriter : public PatternRewriter { ++ public: ++ std::string name() const override { return "KPFusedEmbeddingPaddingFast"; } ++ ++ bool match_and_rewrite( ++ const NodeDef* node, GraphDef* graph, ++ std::unordered_map<std::string, int>& node_indexes) override { ++ graph_ = graph; ++ indexes_ = &node_indexes; ++ CHECK_NODE_OK(IsStridedSlice(*node) && node->input_size() == 4) ++ CHECK_NODE_OK(check_const_shape(get_mutable_node(node->input(1)), {0})) ++ CHECK_NODE_OK(check_const_shape(get_mutable_node(node->input(2)), {1})) ++ CHECK_NODE_OK(check_const_shape(get_mutable_node(node->input(3)), {1})) ++ CHECK_NODE_OK(check_int_attr(node, "shrink_axis_mask", 1)) ++ const NodeDef* shape = get_node(node->input(0)); ++ CHECK_NODE_OK(IsShape(*shape) && shape->input_size() == 1) ++ const NodeDef* reshape = get_node(shape->input(0)); ++ CHECK_NODE_OK(IsReshape(*reshape) && reshape->input_size() == 2) ++ const NodeDef* concat = get_node(reshape->input(0)); ++ CHECK_NODE_OK(IsConcat(*concat) && concat->input_size() == 3) ++ const NodeDef* fill = get_node(concat->input(1)); ++ CHECK_NODE_OK(IsFill(*fill) && fill->input_size() == 2) ++ const NodeDef* pack = get_node(fill->input(0)); ++ CHECK_NODE_OK(IsPack(*pack) && pack->input_size() == 2) ++ const NodeDef* sub = get_node(pack->input(0)); ++ CHECK_NODE_OK(IsSub(*sub) && sub->input_size() == 2) ++ const NodeDef* strided_slice = get_node(sub->input(0)); ++ CHECK_NODE_OK(IsStridedSlice(*strided_slice) && ++ strided_slice->input_size() == 4) ++ const NodeDef* cast = get_node(strided_slice->input(0)); ++ CHECK_NODE_OK(IsCast(*cast)) ++ ++ auto nodes = graph->mutable_node(); ++ NodeDef* fused_node = nodes->Add(); ++ fused_node->set_name(node->name() + fusion_appendix); ++ fused_node->set_op(name()); ++ fused_node->set_device(node->device()); ++ fused_node->add_input(cast->input(0)); ++ fused_node->add_input(concat->input(0)); ++ fused_node->add_input(sub->input(1)); ++ fused_node->add_input(reshape->input(1)); ++ const NodeDef* pack_left = get_node(pack->input(0)); ++ const NodeDef* pack_right = get_node(pack->input(1)); ++ if (IsConstant(*pack_left) || IsHostConstant(*pack_left)) { ++ fused_node->add_input(pack->input(0)); ++ } else if (IsConstant(*pack_right) || IsHostConstant(*pack_right)) { ++ fused_node->add_input(pack->input(1)); ++ } else { ++ return false; ++ } ++ nodes->SwapElements(node_indexes.at(node->name()), nodes->size() - 1); ++ ++ VLOG(0) << "-- Add node: [" << fused_node->op() << "] " ++ << fused_node->name(); ++ replace_all_users_with(sub, 0, fused_node, 0, graph); ++ replace_all_users_with(node, 0, fused_node, 1, graph); ++ return true; ++ } ++}; ++ ++class KPFusedSparseSelectRewriter : public PatternRewriter { ++ public: ++ std::string name() const override { return "KPFusedSparseSelect"; } ++ ++ bool 
match_and_rewrite( ++ const NodeDef* node, GraphDef* graph, ++ std::unordered_map<std::string, int>& node_indexes) override { ++ graph_ = graph; ++ indexes_ = &node_indexes; ++ CHECK_NODE_OK(IsConcat(*node) && node->input_size() == 3) ++ const NodeDef* select_0 = get_node(node->input(0)); ++ CHECK_NODE_OK(IsSelect(*select_0) && select_0->input_size() == 3) ++ const NodeDef* select_1 = get_node(select_0->input(2)); ++ CHECK_NODE_OK(IsSelect(*select_1) && select_1->input_size() == 3) ++ const NodeDef* fill = get_node(select_1->input(1)); ++ CHECK_NODE_OK(IsFill(*fill) && fill->input_size() == 2) ++ CHECK_NODE_OK( ++ check_const_value<float>(get_mutable_node(fill->input(1)), {1.0f})) ++ const NodeDef* cast = get_node(select_1->input(2)); ++ CHECK_NODE_OK(IsCast(*cast) && cast->input_size() == 1) ++ const NodeDef* equal = get_node(select_1->input(0)); ++ CHECK_NODE_OK(IsEqual(*equal) && equal->input_size() == 2) ++ NodeDef* reshape = get_operand(equal, "Reshape"); ++ CHECK_NODE_OK(reshape != nullptr && IsReshape(*reshape)) ++ const NodeDef* greater = get_node(cast->input(0)); ++ CHECK_NODE_OK(IsGreater(*greater) && greater->input_size() == 2) ++ const NodeDef* reshape_4 = get_node(greater->input(0)); ++ CHECK_NODE_OK(IsReshape(*reshape_4) && reshape_4->input_size() == 2) ++ CHECK_NODE_OK( ++ check_const_value<int>(get_mutable_node(reshape_4->input(1)), {-1, 1})) ++ const NodeDef* equal_1 = get_node(select_0->input(0)); ++ CHECK_NODE_OK(IsEqual(*equal_1) && equal_1->input_size() == 2) ++ NodeDef* reshape_1 = get_operand(equal_1, "Reshape"); ++ CHECK_NODE_OK(reshape_1 != nullptr && IsReshape(*reshape_1)) ++ CHECK_NODE_OK( ++ check_const_value<int>(get_mutable_node(reshape_1->input(1)), {-1, 1})) ++ ++ // right branch ++ const NodeDef* select_2 = get_node(node->input(1)); ++ CHECK_NODE_OK(IsSelect(*select_2) && select_2->input_size() == 3) ++ const NodeDef* equal_2 = get_node(select_2->input(0)); ++ CHECK_NODE_OK(IsEqual(*equal_2) && equal_2->input_size() == 2) ++ const NodeDef* fill_1 = get_node(select_2->input(2)); ++ CHECK_NODE_OK(IsFill(*fill_1) && fill_1->input_size() == 2) ++ CHECK_NODE_OK( ++ check_const_value<float>(get_mutable_node(fill_1->input(1)), {1.0f})) ++ const NodeDef* reshape_2 = get_operand(equal_2, "Reshape"); ++ CHECK_NODE_OK(reshape_2 != nullptr && IsReshape(*reshape_2)) ++ CHECK_NODE_OK( ++ check_const_value<int>(get_mutable_node(reshape_2->input(1)), {-1, 1})) ++ ++ auto nodes = graph->mutable_node(); ++ NodeDef* fused_node = nodes->Add(); ++ fused_node->set_name(node->name() + fusion_appendix); ++ fused_node->set_op(name()); ++ fused_node->set_device(node->device()); ++ fused_node->add_input(reshape->input(0)); ++ fused_node->add_input(reshape_1->input(0)); ++ fused_node->add_input(reshape_2->input(0)); ++ std::vector<const NodeDef*> const_inputs = {equal, equal_1, equal_2, ++ greater}; ++ for (const NodeDef* const_node : const_inputs) { ++ const NodeDef* left = get_node(const_node->input(0)); ++ const NodeDef* right = get_node(const_node->input(1)); ++ if (IsConstant(*left) || IsHostConstant(*left)) { ++ fused_node->add_input(const_node->input(0)); ++ } else if (IsConstant(*right) || IsHostConstant(*right)) { ++ fused_node->add_input(const_node->input(1)); ++ } else { ++ return false; ++ } ++ } ++ nodes->SwapElements(node_indexes.at(node->name()), nodes->size() - 1); ++ ++ VLOG(0) << "-- Add node: [" << fused_node->op() << "] " ++ << fused_node->name(); ++ replace_all_users_with(reshape, 0, fused_node, 0, graph); ++ replace_all_users_with(select_0, 0, fused_node, 1, 
graph); ++ replace_all_users_with(node, 0, fused_node, 2, graph); ++ return true; ++ } ++}; ++ ++class KPFusedGatherRewriter : public PatternRewriter { ++ public: ++ std::string name() const override { return "KPFusedGather"; } ++ ++ bool match_and_rewrite( ++ const NodeDef* node, GraphDef* graph, ++ std::unordered_map<std::string, int>& node_indexes) override { ++ graph_ = graph; ++ indexes_ = &node_indexes; ++ CHECK_NODE_OK(node->op() == "GatherV2" && node->input_size() == 3) ++ CHECK_NODE_OK(check_const_value<int>(get_mutable_node(node->input(2)), {0})) ++ const NodeDef* gather = get_node(node->input(0)); ++ CHECK_NODE_OK(gather->op() == "GatherV2" && ++ gather->input_size() == 3) // input:0 ++ CHECK_NODE_OK( ++ check_const_value<int>(get_mutable_node(gather->input(2)), {0})) ++ const NodeDef* unique = get_node(node->input(1)); // output:1 ++ CHECK_NODE_OK(unique->op() == "Unique" && unique->input_size() == 1) ++ const NodeDef* unique_1 = get_node(unique->input(0)); ++ CHECK_NODE_OK(unique_1->op() == "Unique" && unique_1->input_size() == 1) ++ const NodeDef* strided_slice = get_node(unique_1->input(0)); ++ CHECK_NODE_OK(IsStridedSlice(*strided_slice)) // input:1 2 ++ CHECK_NODE_OK(check_int_attr(strided_slice, "shrink_axis_mask", 2)) ++ CHECK_NODE_OK(check_int_attr(strided_slice, "begin_mask", 1)) ++ CHECK_NODE_OK(check_int_attr(strided_slice, "end_mask", 1)) ++ ++ auto nodes = graph->mutable_node(); ++ NodeDef* fused_node = nodes->Add(); ++ fused_node->set_name(node->name() + fusion_appendix); ++ fused_node->set_op(name()); ++ fused_node->set_device(node->device()); ++ fused_node->add_input(gather->input(0)); ++ fused_node->add_input(strided_slice->input(0)); ++ fused_node->add_input(strided_slice->input(1)); ++ nodes->SwapElements(node_indexes.at(node->name()), nodes->size() - 1); ++ ++ VLOG(0) << "-- Add node: [" << fused_node->op() << "] " ++ << fused_node->name(); ++ replace_all_users_with(unique_1, 0, fused_node, 0, graph); ++ replace_all_users_with(unique_1, 1, fused_node, 1, graph); ++ replace_all_users_with(node, 0, fused_node, 2, graph); ++ return true; ++ } ++}; ++ ++class KPFusedSparseReshapeRewriter : public PatternRewriter { ++ public: ++ std::string name() const override { return "KPFusedSparseReshape"; } ++ ++ bool match_and_rewrite( ++ const NodeDef* node, GraphDef* graph, ++ std::unordered_map<std::string, int>& node_indexes) override { ++ graph_ = graph; ++ indexes_ = &node_indexes; ++ CHECK_NODE_OK(node->op() == "SparseReshape" && node->input_size() == 3) ++ const NodeDef* concat = get_node(node->input(0)); ++ CHECK_NODE_OK(IsConcat(*concat) && concat->input_size() == 3) ++ CHECK_NODE_OK( ++ check_const_value<int>(get_mutable_node(concat->input(2)), {-1})) ++ const NodeDef* reshape = get_node(concat->input(1)); ++ CHECK_NODE_OK(IsReshape(*reshape) && reshape->input_size() == 2) ++ CHECK_NODE_OK( ++ check_const_value<int>(get_mutable_node(reshape->input(1)), {-1, 1})) ++ const NodeDef* strided_slice = get_node(reshape->input(0)); ++ CHECK_NODE_OK(IsStridedSlice(*strided_slice) && ++ strided_slice->input_size() == 4) ++ CHECK_NODE_OK(check_int_attr(strided_slice, "shrink_axis_mask", 2)) ++ CHECK_NODE_OK(check_int_attr(strided_slice, "begin_mask", 1)) ++ CHECK_NODE_OK(check_int_attr(strided_slice, "end_mask", 1)) ++ const NodeDef* cast_1 = get_node(concat->input(0)); ++ CHECK_NODE_OK(IsCast(*cast_1) && cast_1->input_size() == 1) ++ const NodeDef* reshape_1 = get_node(cast_1->input(0)); ++ CHECK_NODE_OK(IsReshape(*reshape_1) && reshape_1->input_size() == 2) ++ 
CHECK_NODE_OK( ++ check_const_value<int>(get_mutable_node(reshape_1->input(1)), {-1, 1})) ++ const NodeDef* range = get_node(reshape_1->input(0)); ++ CHECK_NODE_OK(range->op() == "Range" && range->input_size() == 3) ++ // Range start=0, delta=1 ++ CHECK_NODE_OK( ++ check_const_value<int>(get_mutable_node(range->input(0)), {0})) ++ CHECK_NODE_OK( ++ check_const_value<int>(get_mutable_node(range->input(2)), {1})) ++ const NodeDef* cast = get_node(node->input(1)); ++ CHECK_NODE_OK(IsCast(*cast) && cast->input_size() == 1) ++ const NodeDef* pack = get_node(cast->input(0)); ++ CHECK_NODE_OK(IsPack(*pack) && pack->input_size() == 2) ++ const NodeDef* strided_slice_1 = get_node(pack->input(0)); ++ CHECK_NODE_OK(IsStridedSlice(*strided_slice_1) && ++ strided_slice_1->input_size() == 4) ++ CHECK_NODE_OK(check_const_value<int>( ++ get_mutable_node(strided_slice_1->input(1)), {0})) ++ CHECK_NODE_OK(check_const_value<int>( ++ get_mutable_node(strided_slice_1->input(2)), {1})) ++ CHECK_NODE_OK(check_const_value<int>( ++ get_mutable_node(strided_slice_1->input(3)), {1})) ++ CHECK_NODE_OK(check_int_attr(strided_slice_1, "shrink_axis_mask", 1)) ++ const NodeDef* shape = get_node(strided_slice_1->input(0)); ++ CHECK_NODE_OK(IsShape(*shape) && shape->input_size() == 1) ++ ++ auto nodes = graph->mutable_node(); ++ NodeDef* fused_node = nodes->Add(); ++ fused_node->set_name(node->name() + fusion_appendix); ++ fused_node->set_op(name()); ++ fused_node->set_device(node->device()); ++ fused_node->add_input(shape->input(0)); ++ fused_node->add_input(strided_slice->input(1)); ++ fused_node->add_input(node->input(2)); ++ fused_node->add_input(pack->input(1)); ++ nodes->SwapElements(node_indexes.at(node->name()), nodes->size() - 1); ++ ++ VLOG(0) << "-- Add node: [" << fused_node->op() << "] " ++ << fused_node->name(); ++ replace_all_users_with(node, 0, fused_node, 0, graph); ++ replace_all_users_with(node, 1, fused_node, 1, graph); ++ return true; ++ } ++}; ++ ++class KPFusedEmbeddingActionIdGatherRewriter : public PatternRewriter { ++ public: ++ std::string name() const override { return "KPFusedEmbeddingActionIdGather"; } ++ ++ bool match_and_rewrite( ++ const NodeDef* node, GraphDef* graph, ++ std::unordered_map<std::string, int>& node_indexes) override { ++ graph_ = graph; ++ indexes_ = &node_indexes; ++ CHECK_NODE_OK(IsConcat(*node) && node->input_size() == 3) ++ CHECK_NODE_OK( ++ check_const_value<int>(get_mutable_node(node->input(2)), {-1})) ++ const NodeDef* reshape = get_node(node->input(0)); ++ CHECK_NODE_OK(IsReshape(*reshape) && reshape->input_size() == 2) ++ const NodeDef* gather = get_node(reshape->input(0)); ++ const NodeDef* pack_1 = get_node(reshape->input(1)); ++ CHECK_NODE_OK(IsPack(*pack_1) && pack_1->input_size() == 2) ++ CHECK_NODE_OK( ++ check_const_value<int>(get_mutable_node(pack_1->input(1)), {-1})) ++ CHECK_NODE_OK(gather->op() == "GatherV2" && gather->input_size() == 3) ++ CHECK_NODE_OK( ++ check_const_value<int>(get_mutable_node(gather->input(2)), {0})) ++ const NodeDef* gather_1 = get_node(gather->input(0)); ++ CHECK_NODE_OK(gather_1->op() == "GatherV2" && gather_1->input_size() == 3) ++ CHECK_NODE_OK( ++ check_const_value<int>(get_mutable_node(gather_1->input(2)), {0})) ++ const NodeDef* fill = get_node(node->input(1)); ++ CHECK_NODE_OK(IsFill(*fill) && fill->input_size() == 2) ++ CHECK_NODE_OK(check_const_value<int>(get_mutable_node(fill->input(1)), {0})) ++ const NodeDef* pack = get_node(fill->input(0)); ++ CHECK_NODE_OK(IsPack(*pack) && pack->input_size() == 2) ++ ++ auto nodes = 
graph->mutable_node(); ++ NodeDef* fused_node = nodes->Add(); ++ fused_node->set_name(node->name() + fusion_appendix); ++ fused_node->set_op(name()); ++ fused_node->set_device(node->device()); ++ fused_node->add_input(gather_1->input(1)); ++ fused_node->add_input(gather_1->input(0)); ++ fused_node->add_input(gather->input(1)); ++ fused_node->add_input(pack->input(0)); ++ fused_node->add_input(pack->input(1)); ++ nodes->SwapElements(node_indexes.at(node->name()), nodes->size() - 1); ++ ++ VLOG(0) << "-- Add node: [" << fused_node->op() << "] " ++ << fused_node->name(); ++ replace_all_users_with(node, 0, fused_node, 0, graph); ++ return true; ++ } ++}; ++ ++void run_graph_optimization(GraphDef* graph) { ++ GraphOptimizer optimizer(graph); ++ ++ const char* annc_fused_all = getenv("ANNC_FUSED_ALL"); ++ const char* annc_fused_sps_stitch = getenv("ANNC_FUSED_SPS_STITCH"); ++ const char* annc_fused_sps_reduce = getenv("ANNC_FUSED_SPS_REDUCE"); ++ const char* annc_fused_emb_padding = getenv("ANNC_FUSED_EMD_PADDING"); ++ const char* annc_fused_emb_padding_fast = ++ getenv("ANNC_FUSED_EMD_PADDING_FAST"); ++ const char* annc_fused_sps_select = getenv("ANNC_FUSED_SPS_SELECT"); ++ const char* annc_fused_gather = getenv("ANNC_FUSED_GATHER"); ++ const char* annc_fused_sps_reshape = getenv("ANNC_FUSED_SPS_RESHAPE"); ++ const char* annc_fused_emb_actionid_gather = ++ getenv("ANNC_FUSED_EMB_ACTIONID_GATHER"); ++ const char* annc_fused_sps_reduce_nonzero = ++ getenv("ANNC_FUSED_SPS_REDUCE_NONZERO"); ++ ++ bool enable_all = ++ (annc_fused_all != nullptr) && strcmp(annc_fused_all, "1") == 0; ++ ++ // default enable all rewriters ++ if (enable_all || (annc_fused_sps_stitch != nullptr && ++ strcmp(annc_fused_sps_stitch, "1") == 0)) ++ optimizer.register_rewriter( ++ std::make_unique<KPFusedSparseDynamicStitchRewriter>()); ++ if (enable_all || (annc_fused_sps_reduce != nullptr && ++ strcmp(annc_fused_sps_reduce, "1") == 0)) ++ optimizer.register_rewriter( ++ std::make_unique<KPFusedSparseSegmentReduceRewriter>()); ++ if (enable_all || (annc_fused_emb_padding_fast != nullptr && ++ strcmp(annc_fused_emb_padding_fast, "1") == 0)) ++ optimizer.register_rewriter( ++ std::make_unique<KPFusedEmbeddingPaddingFastRewriter>()); ++ if (enable_all || (annc_fused_emb_padding != nullptr && ++ strcmp(annc_fused_emb_padding, "1") == 0)) ++ optimizer.register_rewriter( ++ std::make_unique<KPFusedEmbeddingPaddingRewriter>()); ++ if (enable_all || (annc_fused_sps_select != nullptr && ++ strcmp(annc_fused_sps_select, "1") == 0)) ++ optimizer.register_rewriter( ++ std::make_unique<KPFusedSparseSelectRewriter>()); ++ if (enable_all || ++ (annc_fused_gather != nullptr && strcmp(annc_fused_gather, "1") == 0)) ++ optimizer.register_rewriter(std::make_unique<KPFusedGatherRewriter>()); ++ if (enable_all || (annc_fused_sps_reshape != nullptr && ++ strcmp(annc_fused_sps_reshape, "1") == 0)) ++ optimizer.register_rewriter(std::make_unique<KPFusedSparseReshapeRewriter>()); ++ if (annc_fused_emb_actionid_gather != nullptr && ++ strcmp(annc_fused_emb_actionid_gather, "1") == 0) ++ optimizer.register_rewriter( ++ std::make_unique<KPFusedEmbeddingActionIdGatherRewriter>()); ++ if (annc_fused_sps_reduce_nonzero != nullptr && ++ strcmp(annc_fused_sps_reduce_nonzero, "1") == 0) ++ optimizer.register_rewriter( ++ std::make_unique<KPFusedSparseSegmentReduceNonzeroRewriter>()); ++ optimizer.optimize(); ++} ++} // namespace annc +diff --git a/annc/tensorflow/graph_optimizer/graph_opt.h b/annc/tensorflow/graph_optimizer/graph_opt.h +new file mode 100644 
+index 0000000..0ac0273 +--- /dev/null ++++ b/annc/tensorflow/graph_optimizer/graph_opt.h +@@ -0,0 +1,165 @@ ++#ifndef ANNC_TF_GRAPH_OPT_H_ ++#define ANNC_TF_GRAPH_OPT_H_ ++#include <type_traits> ++#include <unordered_map> ++ ++#include "tensorflow/core/grappler/graph_view.h" ++#include "tensorflow/core/grappler/grappler_item.h" ++#include "tensorflow/core/grappler/op_types.h" ++ ++using namespace tensorflow; ++using namespace tensorflow::grappler; ++ ++namespace annc { ++#define CHECK_NODE_OK(x) \ ++ if (!(x)) { \ ++ return false; \ ++ } ++ ++static const std::string fusion_appendix = "/kp_fused"; ++ ++void update_node_indexes(const GraphDef* graph, ++ std::unordered_map<std::string, int>& node_indexes); ++ ++class PatternRewriter { ++ public: ++ PatternRewriter() {} ++ virtual ~PatternRewriter() = default; ++ ++ virtual bool match_and_rewrite( ++ const NodeDef* node, GraphDef* graph, ++ std::unordered_map<std::string, int>& node_indexes) = 0; ++ ++ virtual std::string name() const { return "PatternRewriter"; }; ++ ++ const NodeDef* get_node(const std::string& name); ++ NodeDef* get_mutable_node(const std::string& name); ++ ++ NodeDef* get_operand(const NodeDef* node, std::string op_type); ++ ++ const NodeDef* get_user(const NodeDef* node, int index, ++ const std::string& op_type); ++ ++ void replace_all_users_with(const NodeDef* old_node, int old_index, ++ const NodeDef* new_node, int new_index, ++ GraphDef* graph); ++ ++ bool check_input_dims(NodeDef* op, const std::string& output_name, ++ int dim_size) { ++ if (op->attr().count("_output_shapes")) { ++ int pos_index = 0; ++ size_t pos = output_name.find_last_of(':'); ++ if (pos != std::string::npos) { ++ pos_index = std::stoi(output_name.substr(pos + 1)); ++ } ++ const TensorShapeProto& shape = ++ op->attr().at("_output_shapes").list().shape(pos_index); ++ if (shape.dim_size() == dim_size) return true; ++ } else if (op->attr().count("shape")) { ++ const TensorShapeProto& shape = op->attr().at("shape").shape(); ++ if (shape.dim_size() == dim_size) return true; ++ } ++ return false; ++ } ++ ++ bool check_const_dims(NodeDef* op, int dim_size) { ++ if (!((IsConstant(*op) || IsHostConstant(*op)) && ++ HasNodeAttr(*op, "value"))) ++ return false; ++ ++ TensorProto* tensor = (*op->mutable_attr())["value"].mutable_tensor(); ++ const auto& shape = tensor->tensor_shape(); ++ if (shape.dim_size() != static_cast<int>(dim_size)) return false; ++ return true; ++ } ++ ++ bool check_const_shape(NodeDef* op, std::vector<int> dims) { ++ if (!((IsConstant(*op) || IsHostConstant(*op)) && ++ HasNodeAttr(*op, "value"))) ++ return false; ++ ++ TensorProto* tensor = (*op->mutable_attr())["value"].mutable_tensor(); ++ const auto& shape = tensor->tensor_shape(); ++ if (shape.dim_size() != static_cast<int>(dims.size())) return false; ++ for (int i = 0; i < shape.dim_size(); ++i) { ++ if (shape.dim(i).size() != dims[i]) return false; ++ } ++ return true; ++ } ++ ++ template <typename T> ++ bool check_const_value(NodeDef* op, std::vector<T> cmp) { ++ if (!((IsConstant(*op) || IsHostConstant(*op)) && ++ HasNodeAttr(*op, "value"))) ++ return false; ++ ++ TensorProto* tensor = (*op->mutable_attr())["value"].mutable_tensor(); ++ const auto& shape = tensor->tensor_shape(); ++ int dim_size = 1; ++ for (int i = 0; i < shape.dim_size(); ++i) { ++ dim_size *= shape.dim(i).size(); ++ } ++ if (dim_size < static_cast<int>(cmp.size())) return false; ++ ++ if (std::is_same<T, float>::value) { ++ const float* data = tensor->mutable_float_val()->data(); ++ if (data == nullptr) ++ 
data = reinterpret_cast<const float*>(tensor->tensor_content().data()); ++ if (data == nullptr) return false; ++ for (int i = 0; i < static_cast<int>(cmp.size()); ++i) { ++ if (std::fabs(data[i] - cmp[i]) >= 1e-5f) return false; ++ } ++ } else if (std::is_same<T, int>::value) { ++ const int* data = tensor->mutable_int_val()->data(); ++ if (data == nullptr) ++ data = reinterpret_cast<const int*>(tensor->tensor_content().data()); ++ if (data == nullptr) return false; ++ for (int i = 0; i < static_cast<int>(cmp.size()); ++i) { ++ if (data[i] != cmp[i]) return false; ++ } ++ } else if (std::is_same<T, int64_t>::value) { ++ const int64_t* data = tensor->mutable_int64_val()->data(); ++ if (data == nullptr) ++ data = ++ reinterpret_cast<const int64_t*>(tensor->tensor_content().data()); ++ if (data == nullptr) return false; ++ for (int i = 0; i < static_cast<int>(cmp.size()); ++i) { ++ if (data[i] != cmp[i]) return false; ++ } ++ } else { ++ // data type do not support ++ return false; ++ } ++ return true; ++ } ++ ++ bool check_int_attr(const NodeDef* op, std::string name, int value) { ++ if (HasNodeAttr(*op, name)) { ++ AttrValue attr = op->attr().at(name); ++ if (attr.value_case() == AttrValue::kI && attr.i() == value) return true; ++ } ++ return false; ++ } ++ ++ GraphDef* graph_; ++ std::unordered_map<std::string, int>* indexes_; ++}; ++ ++class GraphOptimizer { ++ public: ++ GraphOptimizer(GraphDef* graph) : graph_(graph) {} ++ virtual ~GraphOptimizer() = default; ++ ++ void register_rewriter(std::unique_ptr<PatternRewriter> rewriter); ++ ++ void optimize(); ++ ++ private: ++ GraphDef* graph_; ++ std::unordered_map<std::string, int> node_indexes_; ++ std::vector<std::unique_ptr<PatternRewriter>> rewriters_; ++}; ++ ++void run_graph_optimization(GraphDef* graph); ++} // namespace annc ++#endif // ANNC_TF_GRAPH_OPT_H_ +diff --git a/annc/tensorflow/kernels/embedding_fused_action_id_gather.cc b/annc/tensorflow/kernels/embedding_fused_action_id_gather.cc +new file mode 100644 +index 0000000..db7c92c +--- /dev/null ++++ b/annc/tensorflow/kernels/embedding_fused_action_id_gather.cc +@@ -0,0 +1,144 @@ ++/* Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved. ++ ++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. ++You may obtain a copy of the License at ++ ++ http://www.apache.org/licenses/LICENSE-2.0 ++ ++Unless required by applicable law or agreed to in writing, software ++distributed under the License is distributed on an "AS IS" BASIS, ++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++See the License for the specific language governing permissions and ++limitations under the License. 
++==============================================================================*/ ++ ++#include "tensorflow/core/framework/op_kernel.h" ++#include "tensorflow/core/framework/tensor.h" ++#include "tensorflow/core/util/work_sharder.h" ++ ++namespace tensorflow { ++ ++template <typename Tindices> ++static void GatherV2Impl(OpKernelContext* context, const float* params_data, const TensorShape& params_shape, ++ const Tindices* indices_data, const TensorShape& indices_shape, int axis, Tensor* temp) { ++ TensorShape temp_shape; ++ const int P0 = params_shape.dim_size(0); ++ int P1 = 1; ++ for (int d = 0; d < indices_shape.dims(); ++d) { ++ temp_shape.AddDim(indices_shape.dim_size(d)); ++ } ++ ++ for (int d = 1; d < params_shape.dims(); ++d) { ++ temp_shape.AddDim(params_shape.dim_size(d)); ++ P1 *= params_shape.dim_size(d); ++ } ++ OP_REQUIRES_OK(context, context->allocate_temp(DT_FLOAT, temp_shape, temp)); ++ VLOG(1) << "temp shape: " << temp->shape().DebugString(); ++ ++ const int num_indices = indices_shape.num_elements(); ++ float* temp_data = temp->flat<float>().data(); ++ VLOG(1) << "num_indices : " << num_indices; ++ OP_REQUIRES(context, axis == 0, errors::InvalidArgument("axis only support 0")); ++ const int slice_size = P1; ++ for (int i = 0; i < num_indices; ++i) { ++ Tindices idx = indices_data[i]; ++ OP_REQUIRES(context, (idx >= 0 && idx < P0), errors::InvalidArgument("GatherV2 axis=0: index out of range")); ++ std::memcpy( ++ temp_data + i * slice_size, params_data + idx * slice_size, sizeof(float) * slice_size ++ ); ++ } ++} ++ ++ ++template <typename Tindices1, typename Tindices2> ++class KPFusedEmbeddingActionIdGatherOp : public OpKernel { ++public: ++ explicit KPFusedEmbeddingActionIdGatherOp(OpKernelConstruction* context) : OpKernel(context) {} ++ ++ void Compute(OpKernelContext* context) override { ++ // Grab the input tensor ++ const Tensor& indices1 = context->input(0); ++ const Tensor& params = context->input(1); ++ const Tensor& indices2 = context->input(2); ++ const Tensor& pack_dim = context->input(3); ++ ++ const Tensor& pack = context->input(4); ++ ++ VLOG(1) << "indices1 shape: " << indices1.shape().DebugString(); ++ VLOG(1) << "params shape: " << params.shape().DebugString(); ++ VLOG(1) << "indices2 shape: " << indices2.shape().DebugString(); ++ OP_REQUIRES( ++ context, ++ TensorShapeUtils::IsMatrix(indices1.shape()), ++ errors::InvalidArgument("indices1 dims must = 2") ++ ); ++ OP_REQUIRES( ++ context, ++ TensorShapeUtils::IsMatrix(indices2.shape()), ++ errors::InvalidArgument("indices2 dims must = 2") ++ ); ++ OP_REQUIRES( ++ context, ++ TensorShapeUtils::IsMatrix(params.shape()), ++ errors::InvalidArgument("params dims must = 2") ++ ); ++ OP_REQUIRES( ++ context, ++ TensorShapeUtils::IsScalar(pack_dim.shape()), ++ errors::InvalidArgument("pack_dim is scalar") ++ ); ++ OP_REQUIRES( ++ context, ++ TensorShapeUtils::IsScalar(pack.shape()), ++ errors::InvalidArgument("pack const is scalar") ++ ); ++ ++ Tensor temp; ++ GatherV2Impl<Tindices1>(context, params.flat<float>().data(), params.shape(), indices1.flat<Tindices1>().data(), ++ indices1.shape(), 0, &temp); ++ Tensor temp1; ++ GatherV2Impl<Tindices2>(context, temp.flat<float>().data(), temp.shape(), indices2.flat<Tindices2>().data(), ++ indices2.shape(), 0, &temp1); ++ int pack_size = pack_dim.scalar<int32>()(); ++ int pack_const = pack.scalar<int32>()(); ++ OP_REQUIRES(context, pack_size > 0, errors::InvalidArgument("pack_size must > 0")); ++ int a_reshaped_cols = temp1.NumElements() / pack_size; ++ auto 
a_reshaped = temp1.shaped<float, 2>({pack_size, a_reshaped_cols}); ++ Tensor* output; ++ int output_cols = a_reshaped_cols + pack_const; ++ OP_REQUIRES_OK(context, ++ context->allocate_output(0, TensorShape({pack_size, output_cols}), &output)); ++ auto a_reshaped_data = a_reshaped.data(); ++ auto worker_threads = context->device()->tensorflow_cpu_worker_threads(); ++ const int64 cost_per_unit = a_reshaped_cols + pack_const; ++ auto work = [&](int64 start_row, int64 end_row) { ++ float* base = output->matrix<float>().data(); ++ for (int64 row = start_row; row < end_row; ++row) { ++ float* dst_row = base + row * (a_reshaped_cols + pack_const); ++ std::memcpy( ++ dst_row, a_reshaped_data + row * a_reshaped_cols, sizeof(float) * a_reshaped_cols ++ ); ++ std::memset( ++ dst_row + a_reshaped_cols, 0, sizeof(float) * pack_const ++ ); ++ } ++ }; ++ Shard(worker_threads->num_threads, worker_threads->workers, pack_size, ++ cost_per_unit, work); ++ } ++}; ++ ++#define REGISTER_CPU_KERNEL(Tindices1, Tindices2) \ ++ REGISTER_KERNEL_BUILDER(Name("KPFusedEmbeddingActionIdGather") \ ++ .Device(DEVICE_CPU) \ ++ .TypeConstraint<Tindices1>("Tindices1") \ ++ .TypeConstraint<Tindices2>("Tindices2"), \ ++ KPFusedEmbeddingActionIdGatherOp<Tindices1, Tindices2>); ++ ++REGISTER_CPU_KERNEL(int64, int32) ++REGISTER_CPU_KERNEL(int32, int32) ++REGISTER_CPU_KERNEL(int64, int64) ++REGISTER_CPU_KERNEL(int32, int64) ++ ++} +\ No newline at end of file +diff --git a/annc/tensorflow/kernels/embedding_fused_action_id_gather_test.cc b/annc/tensorflow/kernels/embedding_fused_action_id_gather_test.cc +new file mode 100644 +index 0000000..11067bf +--- /dev/null ++++ b/annc/tensorflow/kernels/embedding_fused_action_id_gather_test.cc +@@ -0,0 +1,289 @@ ++/* Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ * ==============================================================================*/ ++ ++#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" ++#include "tensorflow/core/framework/allocator.h" ++#include "tensorflow/core/framework/fake_input.h" ++#include "tensorflow/core/framework/node_def_builder.h" ++#include "tensorflow/core/framework/op_kernel.h" ++#include "tensorflow/core/framework/tensor.h" ++#include "tensorflow/core/framework/types.h" ++#include "tensorflow/core/framework/types.pb.h" ++#include "tensorflow/core/graph/testlib.h" ++#include "tensorflow/core/kernels/ops_testutil.h" ++#include "tensorflow/core/kernels/ops_util.h" ++#include "tensorflow/core/lib/core/status_test_util.h" ++#include "tensorflow/core/lib/gtl/array_slice.h" ++#include "tensorflow/core/lib/random/simple_philox.h" ++#include "tensorflow/core/lib/strings/str_util.h" ++#include "tensorflow/core/platform/test.h" ++#include "tensorflow/core/platform/test_benchmark.h" ++ ++namespace tensorflow { ++ ++class KPFusedEmbeddingActionIdGatherTest : public OpsTestBase { ++ protected: ++ void MakeOp(DataType indices1_type, DataType indices2_type) { ++ TF_ASSERT_OK(NodeDefBuilder("fused_embedding_action_id_gather", ++ "KPFusedEmbeddingActionIdGather") ++ .Input(FakeInput(indices1_type)) // indices1 ++ .Input(FakeInput(DT_FLOAT)) // params ++ .Input(FakeInput(indices2_type)) // indices2 ++ .Input(FakeInput(DT_INT32)) // pack_dim ++ .Input(FakeInput(DT_INT32)) // pack ++ .Finalize(node_def())); ++ TF_ASSERT_OK(InitOp()); ++ } ++ ++ template <typename Tindices1, typename Tindices2> ++ Status FeedAndRun(const std::vector<Tindices1>& indices1_data, ++ const TensorShape& indices1_shape, ++ const std::vector<float>& params_data, ++ const TensorShape& params_shape, ++ const std::vector<Tindices2>& indices2_data, ++ const TensorShape& indices2_shape, int pack_dim_value, ++ int pack_value) { ++ inputs_.clear(); ++ input_types_.clear(); ++ ++ MakeOp(DataTypeToEnum<Tindices1>::v(), DataTypeToEnum<Tindices2>::v()); ++ AddInputFromArray<Tindices1>(indices1_shape, indices1_data); ++ AddInputFromArray<float>(params_shape, params_data); ++ AddInputFromArray<Tindices2>(indices2_shape, indices2_data); ++ AddInputFromArray<int32>(TensorShape({}), {pack_dim_value}); ++ AddInputFromArray<int32>(TensorShape({}), {pack_value}); ++ return RunOpKernel(); ++ } ++}; ++ ++TEST_F(KPFusedEmbeddingActionIdGatherTest, NormalCase) { ++ std::vector<int64> indices1_data = {0, 2}; ++ TensorShape indices1_shape({2, 1}); ++ ++ std::vector<float> params_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; ++ TensorShape params_shape({3, 2}); ++ ++ std::vector<int32> indices2_data = {1, 0}; ++ TensorShape indices2_shape({2, 1}); ++ ++ int pack_dim_value = 2; ++ int pack_value = 1; ++ ++ TF_ASSERT_OK((FeedAndRun<int64, int32>( ++ indices1_data, indices1_shape, params_data, params_shape, indices2_data, ++ indices2_shape, pack_dim_value, pack_value))); ++ ++ Tensor expected(allocator(), DT_FLOAT, TensorShape({2, 3})); ++ test::FillValues<float>(&expected, {5.0f, 6.0f, 0.0f, 1.0f, 2.0f, 0.0f}); ++ test::ExpectTensorNear<float>(expected, *GetOutput(0), 1e-5); ++} ++ ++TEST_F(KPFusedEmbeddingActionIdGatherTest, DifferentIndexTypes) { ++ // int64int32 ++ { ++ std::vector<int64> indices1 = {0, 2}; ++ std::vector<int32> indices2 = {1, 0}; ++ TF_ASSERT_OK((FeedAndRun<int64, int32>(indices1, {2, 1}, ++ {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, ++ {3, 2}, indices2, {2, 1}, 2, 1))); ++ test::ExpectTensorNear<float>( ++ *GetOutput(0), ++ test::AsTensor<float>({5.0f, 
6.0f, 0.0f, 1.0f, 2.0f, 0.0f}, {2, 3}), ++ 1e-5); ++ } ++ ++ // int32int32 ++ { ++ std::vector<int32> indices1 = {0, 2}; ++ std::vector<int32> indices2 = {1, 0}; ++ TF_ASSERT_OK((FeedAndRun<int32, int32>(indices1, {2, 1}, ++ {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, ++ {3, 2}, indices2, {2, 1}, 2, 1))); ++ test::ExpectTensorNear<float>( ++ *GetOutput(0), ++ test::AsTensor<float>({5.0f, 6.0f, 0.0f, 1.0f, 2.0f, 0.0f}, {2, 3}), ++ 1e-5); ++ } ++ ++ // int64int64 ++ { ++ std::vector<int64> indices1 = {0, 2}; ++ std::vector<int64> indices2 = {1, 0}; ++ TF_ASSERT_OK((FeedAndRun<int64, int64>(indices1, {2, 1}, ++ {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, ++ {3, 2}, indices2, {2, 1}, 2, 1))); ++ test::ExpectTensorNear<float>( ++ *GetOutput(0), ++ test::AsTensor<float>({5.0f, 6.0f, 0.0f, 1.0f, 2.0f, 0.0f}, {2, 3}), ++ 1e-5); ++ } ++ ++ // int32int64 ++ { ++ std::vector<int32> indices1 = {0, 2}; ++ std::vector<int64> indices2 = {1, 0}; ++ TF_ASSERT_OK((FeedAndRun<int32, int64>(indices1, {2, 1}, ++ {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}, ++ {3, 2}, indices2, {2, 1}, 2, 1))); ++ test::ExpectTensorNear<float>( ++ *GetOutput(0), ++ test::AsTensor<float>({5.0f, 6.0f, 0.0f, 1.0f, 2.0f, 0.0f}, {2, 3}), ++ 1e-5); ++ } ++} ++ ++TEST_F(KPFusedEmbeddingActionIdGatherTest, InvalidIndices1Dims) { ++ MakeOp(DT_INT64, DT_INT32); ++ ++ std::vector<int64> indices1_data = {0, 2}; ++ AddInputFromArray<int64>(TensorShape({2}), indices1_data); ++ ++ std::vector<float> params_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; ++ AddInputFromArray<float>(TensorShape({3, 2}), params_data); ++ ++ std::vector<int32> indices2_data = {1, 0}; ++ AddInputFromArray<int32>(TensorShape({2, 1}), indices2_data); ++ ++ AddInputFromArray<int32>(TensorShape({}), {2}); ++ AddInputFromArray<int32>(TensorShape({}), {1}); ++ ++ Status s = RunOpKernel(); ++ EXPECT_FALSE(s.ok()); ++ EXPECT_TRUE(absl::StrContains(s.ToString(), "indices1 dims must = 2")) << s; ++} ++ ++TEST_F(KPFusedEmbeddingActionIdGatherTest, InvalidIndices2Dims) { ++ MakeOp(DT_INT64, DT_INT32); ++ ++ std::vector<int64> indices1_data = {0, 2}; ++ AddInputFromArray<int64>(TensorShape({2, 1}), indices1_data); ++ ++ std::vector<float> params_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; ++ AddInputFromArray<float>(TensorShape({3, 2}), params_data); ++ ++ std::vector<int32> indices2_data = {1, 0}; ++ AddInputFromArray<int32>(TensorShape({2}), indices2_data); ++ ++ AddInputFromArray<int32>(TensorShape({}), {2}); ++ AddInputFromArray<int32>(TensorShape({}), {1}); ++ ++ Status s = RunOpKernel(); ++ EXPECT_FALSE(s.ok()); ++ EXPECT_TRUE(absl::StrContains(s.ToString(), "indices2 dims must = 2")) << s; ++} ++ ++TEST_F(KPFusedEmbeddingActionIdGatherTest, InvalidParamsDims) { ++ MakeOp(DT_INT64, DT_INT32); ++ ++ std::vector<int64> indices1_data = {0, 2}; ++ AddInputFromArray<int64>(TensorShape({2, 1}), indices1_data); ++ ++ std::vector<float> params_data = {1.0f, 2.0f, 3.0f, 4.0f}; ++ AddInputFromArray<float>(TensorShape({4}), params_data); ++ ++ std::vector<int32> indices2_data = {1, 0}; ++ AddInputFromArray<int32>(TensorShape({2, 1}), indices2_data); ++ ++ AddInputFromArray<int32>(TensorShape({}), {2}); ++ AddInputFromArray<int32>(TensorShape({}), {1}); ++ ++ Status s = RunOpKernel(); ++ EXPECT_FALSE(s.ok()); ++ EXPECT_TRUE(absl::StrContains(s.ToString(), "params dims must = 2")) << s; ++} ++ ++TEST_F(KPFusedEmbeddingActionIdGatherTest, InvalidPackDimDims) { ++ MakeOp(DT_INT64, DT_INT32); ++ ++ std::vector<int64> indices1_data = {0, 2}; ++ AddInputFromArray<int64>(TensorShape({2, 1}), indices1_data); ++ 
++ std::vector<float> params_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; ++ AddInputFromArray<float>(TensorShape({3, 2}), params_data); ++ ++ std::vector<int32> indices2_data = {1, 0}; ++ AddInputFromArray<int32>(TensorShape({2, 1}), indices2_data); ++ ++ AddInputFromArray<int32>(TensorShape({1}), {2}); ++ AddInputFromArray<int32>(TensorShape({}), {1}); ++ ++ Status s = RunOpKernel(); ++ EXPECT_FALSE(s.ok()); ++ EXPECT_TRUE(absl::StrContains(s.ToString(), "pack_dim is scalar")) << s; ++} ++ ++TEST_F(KPFusedEmbeddingActionIdGatherTest, InvalidPackDims) { ++ MakeOp(DT_INT64, DT_INT32); ++ ++ std::vector<int64> indices1_data = {0, 2}; ++ AddInputFromArray<int64>(TensorShape({2, 1}), indices1_data); ++ ++ std::vector<float> params_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; ++ AddInputFromArray<float>(TensorShape({3, 2}), params_data); ++ ++ std::vector<int32> indices2_data = {1, 0}; ++ AddInputFromArray<int32>(TensorShape({2, 1}), indices2_data); ++ ++ AddInputFromArray<int32>(TensorShape({}), {2}); ++ AddInputFromArray<int32>(TensorShape({1}), {1}); ++ ++ Status s = RunOpKernel(); ++ EXPECT_FALSE(s.ok()); ++ EXPECT_TRUE(absl::StrContains(s.ToString(), "pack const is scalar")) << s; ++} ++ ++TEST_F(KPFusedEmbeddingActionIdGatherTest, InvalidPackSize) { ++ MakeOp(DT_INT64, DT_INT32); ++ ++ std::vector<int64> indices1_data = {0, 2}; ++ AddInputFromArray<int64>(TensorShape({2, 1}), indices1_data); ++ ++ std::vector<float> params_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; ++ AddInputFromArray<float>(TensorShape({3, 2}), params_data); ++ ++ std::vector<int32> indices2_data = {1, 0}; ++ AddInputFromArray<int32>(TensorShape({2, 1}), indices2_data); ++ ++ AddInputFromArray<int32>(TensorShape({}), {0}); ++ AddInputFromArray<int32>(TensorShape({}), {1}); ++ ++ Status s = RunOpKernel(); ++ EXPECT_FALSE(s.ok()); ++ EXPECT_TRUE(absl::StrContains(s.ToString(), "pack_size must > 0")) << s; ++} ++ ++TEST_F(KPFusedEmbeddingActionIdGatherTest, IndexOutOfRange) { ++ MakeOp(DT_INT64, DT_INT32); ++ ++ std::vector<int64> indices1_data = {0, 5}; ++ AddInputFromArray<int64>(TensorShape({2, 1}), indices1_data); ++ ++ std::vector<float> params_data = {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}; ++ AddInputFromArray<float>(TensorShape({3, 2}), params_data); ++ ++ std::vector<int32> indices2_data = {1, 0}; ++ AddInputFromArray<int32>(TensorShape({2, 1}), indices2_data); ++ ++ AddInputFromArray<int32>(TensorShape({}), {2}); ++ AddInputFromArray<int32>(TensorShape({}), {1}); ++ ++ Status s = RunOpKernel(); ++ EXPECT_FALSE(s.ok()); ++ EXPECT_TRUE( ++ absl::StrContains(s.ToString(), "GatherV2 axis=0: index out of range")) ++ << s; ++} ++ ++} // namespace tensorflow +diff --git a/annc/tensorflow/kernels/embedding_fused_gather.cc b/annc/tensorflow/kernels/embedding_fused_gather.cc +new file mode 100644 +index 0000000..c09d1ce +--- /dev/null ++++ b/annc/tensorflow/kernels/embedding_fused_gather.cc +@@ -0,0 +1,90 @@ ++/* Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved.
++
++Licensed under the Apache License, Version 2.0 (the "License");
++you may not use this file except in compliance with the License.
++You may obtain a copy of the License at
++
++ http://www.apache.org/licenses/LICENSE-2.0
++
++Unless required by applicable law or agreed to in writing, software
++distributed under the License is distributed on an "AS IS" BASIS,
++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++See the License for the specific language governing permissions and
++limitations under the License.
++==============================================================================*/
++
++#include "tensorflow/core/framework/op_kernel.h"
++#include "tensorflow/core/framework/tensor.h"
++#include "tensorflow/core/util/work_sharder.h"
++
++using namespace tensorflow;
++
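++// Fused gather kernel: slices one column (begin[1]) out of the int64 key
++// matrix, de-duplicates it (tf.unique-style), and gathers the matching rows
++// of the embedding table. Outputs: 0 = unique keys, 1 = per-row index into
++// output 0, 2 = gathered embedding rows.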
++class KPFusedGather : public OpKernel {
++ public:
++ explicit KPFusedGather(OpKernelConstruction* context) : OpKernel(context) { }
++
++ void Compute(OpKernelContext* context) override {
++ const Tensor& data = context->input(0);
++ const Tensor& keys = context->input(1);
++ const Tensor& begin = context->input(2);
++ VLOG(1) << "Embedding table size: " << data.shape().DebugString();
++ VLOG(1) << "Input key shape: " << keys.shape().DebugString();
++ VLOG(1) << "Slice begin value: " << begin.DebugString();
++
++ OP_REQUIRES(context,
++ TensorShapeUtils::IsMatrix(keys.shape()),
++ errors::Internal("Input key must be 2D"));
++ OP_REQUIRES(context,
++ TensorShapeUtils::IsMatrix(data.shape()),
++ errors::Internal("Embedding table shape must be 2D"));
++ OP_REQUIRES(context, begin.NumElements() == 2, errors::Internal("begin must have the same number of elements as keys rank (2)"));
++ int32 col = begin.flat<int32>().data()[1];
++ OP_REQUIRES(context, col >= 0 && col < keys.dim_size(1), errors::Internal("slice cols out of keys range"));
++
++ Tensor* out_indices = nullptr;
++ OP_REQUIRES_OK(context,
++ context->allocate_output(
++ 1, TensorShape({static_cast<int32>(keys.dim_size(0))}), &out_indices));
++ int32 *out_indices_data = out_indices->flat<int32>().data();
++
++ auto keys_mat = keys.matrix<int64>();
++ std::vector<int64_t> unique_values;
++ std::unordered_map<int64_t, int32_t> value_to_index;
++ int current_index = 0;
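++ // First pass: de-duplicate the selected key column in first-seen order;
++ // out_indices_data[i] maps input row i back into unique_values.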
++ for (int64_t i = 0; i < keys.dim_size(0); ++i) {
++ auto it = value_to_index.find(keys_mat(i, col));
++ if (it == value_to_index.end()) {
++ value_to_index[keys_mat(i, col)] = current_index;
++ unique_values.push_back(keys_mat(i, col));
++ out_indices_data[i] = current_index;
++ ++current_index;
++ } else {
++ out_indices_data[i] = it->second;
++ }
++ }
++
++ Tensor* out_unique_value = nullptr;
++ OP_REQUIRES_OK(context,
++ context->allocate_output(
++ 0, TensorShape({static_cast<int32>(unique_values.size())}), &out_unique_value));
++ std::memcpy(out_unique_value->data(), unique_values.data(), unique_values.size() * sizeof(int64_t));
++
++ Tensor* out_data = nullptr;
++ int embedding_dims = data.dim_size(1);
++ OP_REQUIRES_OK(context,
++ context->allocate_output(
++ 2, TensorShape({static_cast<int32>(unique_values.size()), embedding_dims}), &out_data));
++
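++ // Second pass: copy one embedding row per unique key; rows are contiguous
++ // in both row-major buffers, so a flat per-row memcpy suffices.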
++ const float *data_mat = data.flat<float>().data();
++ for (int64_t cur_row = 0; cur_row < unique_values.size(); ++cur_row) {
++ int64_t idx = unique_values[cur_row];
++ OP_REQUIRES(context, idx >= 0 && idx < data.dim_size(0), errors::Internal("idx out of table range"));
++ const float* src = data_mat + idx * embedding_dims;
++ float* dst = out_data->flat<float>().data() + cur_row * embedding_dims;
++ std::memcpy(dst, src, embedding_dims * sizeof(float));
++ }
++ }
++};
++
++REGISTER_KERNEL_BUILDER(Name("KPFusedGather").Device(DEVICE_CPU),
++ KPFusedGather);
+\ No newline at end of file
+diff --git a/annc/tensorflow/kernels/embedding_fused_gather_test.cc b/annc/tensorflow/kernels/embedding_fused_gather_test.cc
+new file mode 100644
+index 0000000..c947709
+--- /dev/null
++++ b/annc/tensorflow/kernels/embedding_fused_gather_test.cc
+@@ -0,0 +1,186 @@
++/* Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved.
++
++Licensed under the Apache License, Version 2.0 (the "License");
++you may not use this file except in compliance with the License.
++You may obtain a copy of the License at
++
++ http://www.apache.org/licenses/LICENSE-2.0
++
++Unless required by applicable law or agreed to in writing, software
++distributed under the License is distributed on an "AS IS" BASIS,
++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++See the License for the specific language governing permissions and
++limitations under the License.
++==============================================================================*/
++
++#include "tensorflow/core/framework/fake_input.h"
++#include "tensorflow/core/framework/node_def_builder.h"
++#include "tensorflow/core/framework/tensor.h"
++#include "tensorflow/core/framework/tensor_testutil.h"
++#include "tensorflow/core/kernels/ops_testutil.h"
++#include "tensorflow/core/lib/core/status_test_util.h"
++
++namespace {
++using tensorflow::AllocatorAttributes;
++using tensorflow::DT_FLOAT;
++using tensorflow::DT_INT32;
++using tensorflow::DT_INT64;
++using tensorflow::int64;
++using tensorflow::int32;
++using tensorflow::NodeDefBuilder;
++using tensorflow::OpsTestBase;
++using tensorflow::Status;
++using tensorflow::Tensor;
++using tensorflow::TensorShape;
++using tensorflow::test::ExpectClose;
++using tensorflow::test::FillValues;
++using tensorflow::test::AsTensor;
++using tensorflow::test::ExpectTensorEqual;
++
++class KPFusedGatherTest : public OpsTestBase {
++ protected:
++ void RunValidCase(const TensorShape& data_shape,
++ const TensorShape& slice_shape,
++ const std::vector<int32>& begin_val,
++ const std::vector<int64>& slice_data,
++ const std::vector<float>& data_data,
++ const std::vector<int64>& expected_unique,
++ const std::vector<int32>& expected_indices,
++ const std::vector<float>& expected_output_data) {
++ TF_EXPECT_OK(NodeDefBuilder("kp_fused_gather", "KPFusedGather")
++ .Input(FakeInput(DT_FLOAT))
++ .Input(FakeInput(DT_INT64))
++ .Input(FakeInput(DT_INT32))
++ .Finalize(node_def()));
++ TF_EXPECT_OK(InitOp());
++
++ AddInputFromArray<float>(data_shape, data_data);
++ AddInputFromArray<int64>(slice_shape, slice_data);
++ AddInputFromArray<int32>(TensorShape({2}), begin_val);
++
++ TF_ASSERT_OK(RunOpKernel());
++
++ const Tensor& out_unique = *GetOutput(0);
++ const Tensor& out_indices = *GetOutput(1);
++ const Tensor& out_data = *GetOutput(2);
++
++ // Verify output 0: unique_values
++ Tensor expected_unique_tensor(
++ allocator(), DT_INT64,
++ TensorShape({static_cast<int64>(expected_unique.size())})
++ );
++ FillValues<int64>(&expected_unique_tensor, expected_unique);
++ ExpectTensorEqual<int64>(expected_unique_tensor, out_unique);
++
++ // Verify output 1: indices
++ Tensor expected_indices_tensor(
++ allocator(), DT_INT32,
++ TensorShape({static_cast<int64_t>(expected_indices.size())})
++ );
++ FillValues<int32>(&expected_indices_tensor, expected_indices);
++ ExpectTensorEqual<int32>(expected_indices_tensor, out_indices);
++
++ // Verify output 2: out_data
++ Tensor expected_data_tensor(allocator(), DT_FLOAT,
++ TensorShape({static_cast<int64>(expected_unique.size()), data_shape.dim_size(1)}));
++ FillValues<float>(&expected_data_tensor, expected_output_data);
++ ExpectClose(expected_data_tensor, out_data); // use ExpectClose for float comparison
++ }
++
++ Status RunOpExpectFailure(const TensorShape& data_shape,
++ const TensorShape& slice_shape,
++ const std::vector<int32>& begin_val,
++ const std::vector<int64>& slice_data,
++ const std::vector<float>& data_data) {
++ TF_CHECK_OK(NodeDefBuilder("kp_fused_gather", "KPFusedGather")
++ .Input(FakeInput(DT_FLOAT))
++ .Input(FakeInput(DT_INT64))
++ .Input(FakeInput(DT_INT32))
++ .Finalize(node_def()));
++ TF_CHECK_OK(InitOp());
++
++ AddInputFromArray<float>(data_shape, data_data);
++ AddInputFromArray<int64>(slice_shape, slice_data);
++ AddInputFromArray<int32>(TensorShape({2}), begin_val);
++
++ return RunOpKernel();
++ }
++};
++
++// Positive test: normal input
++TEST_F(KPFusedGatherTest, Valid_NormalInput) {
++ RunValidCase(
++ TensorShape({2, 12}), // data shape
++ TensorShape({4, 3}), // slice_input shape
++ {0, 1}, // begin[1] = 1 -> take column 1
++ {1, 1, 3,
++ 0, 1, 5,
++ 1, 0, 7,
++ 0, 1, 9}, // slice_input data
++ {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 12.0f,
++ 13.0f, 14.0f, 15.0f, 16.0f, 17.0f, 18.0f, 19.0f, 20.0f, 21.0f, 22.0f, 23.0f, 24.0f},
++ {1, 0}, // unique values from col=1
++ {0, 0, 1, 0}, // indices mapping
++ {13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, // data[1]
++ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12} // data[0]
++ );
++}
++
++// data is not 2-D
++TEST_F(KPFusedGatherTest, Invalid_DataDimsNot2) {
++ std::vector<float> data = {1.0f, 2.0f, 3.0f, 4.0f};
++ Status s = RunOpExpectFailure(
++ TensorShape({4}), // data is not 2-D
++ TensorShape({2, 2}),
++ {0, 0},
++ {0, 1, 2, 3},
++ data
++ );
++ EXPECT_FALSE(s.ok());
++ EXPECT_TRUE(absl::StrContains(s.message(), "Embedding table shape must be 2D"));
++}
++
++// key is not 2-D
++TEST_F(KPFusedGatherTest, Invalid_SliceInputDimsNot2) {
++ std::vector<float> data(2 * 12, 1.0f);
++ Status s = RunOpExpectFailure(
++ TensorShape({2, 12}),
++ TensorShape({4}), // 1D slice_input
++ {0, 0},
++ {0, 1, 2, 3},
++ data
++ );
++ EXPECT_FALSE(s.ok());
++ EXPECT_TRUE(absl::StrContains(s.message(), "Input key must be 2D"));
++}
++
++// begin[1] out of column range
++TEST_F(KPFusedGatherTest, Invalid_BeginColOutOfRange) {
++ std::vector<float> data(2 * 12, 1.0f);
++ Status s = RunOpExpectFailure(
++ TensorShape({2, 12}),
++ TensorShape({2, 2}),
++ {0, 2}, // begin[1] = 2, but there are only 2 columns -> valid indices 0,1
++ {0, 1, 2, 3},
++ data
++ );
++ EXPECT_FALSE(s.ok());
++ EXPECT_TRUE(absl::StrContains(s.message(),"slice cols out of keys range"));
++}
++
++// gather index exceeds the number of rows in data
++TEST_F(KPFusedGatherTest, Invalid_IndexOutOfRangeInData) {
++ std::vector<float> data(2 * 12, 1.0f);
++ Status s = RunOpExpectFailure(
++ TensorShape({2, 12}),
++ TensorShape({2, 2}),
++ {0, 0},
++ {0, 1,
++ 2, 3}, // index 2 exceeds the rows of data (only 0 and 1 exist)
++ data
++ );
++ EXPECT_FALSE(s.ok());
++ EXPECT_TRUE(absl::StrContains(s.message(),"idx out of table range"));
++}
++
++}
+\ No newline at end of file
+diff --git a/annc/tensorflow/kernels/embedding_fused_padding.cc b/annc/tensorflow/kernels/embedding_fused_padding.cc
+new file mode 100644
+index 0000000..a0f14b2
+--- /dev/null
++++ b/annc/tensorflow/kernels/embedding_fused_padding.cc
+@@ -0,0 +1,126 @@
++/* Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved.
++
++Licensed under the Apache License, Version 2.0 (the "License");
++you may not use this file except in compliance with the License.
++You may obtain a copy of the License at
++
++ http://www.apache.org/licenses/LICENSE-2.0
++
++Unless required by applicable law or agreed to in writing, software
++distributed under the License is distributed on an "AS IS" BASIS,
++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++See the License for the specific language governing permissions and
++limitations under the License.
++==============================================================================*/
++
++#include <vector>
++
++#include "tensorflow/core/framework/common_shape_fns.h"
++#include "tensorflow/core/framework/shape_inference.h"
++#include "tensorflow/core/framework/types.h"
++#include "tensorflow/core/framework/tensor.h"
++#include "tensorflow/core/framework/op_kernel.h"
++
++namespace tensorflow {
++
++using shape_inference::InferenceContext;
++using shape_inference::ShapeHandle;
++
++class KPFusedEmbeddingPaddingOp : public OpKernel {
++public:
++ explicit KPFusedEmbeddingPaddingOp(OpKernelConstruction* context) : OpKernel(context) {
++ fast_ = (type_string() == "KPFusedEmbeddingPaddingFast");
++ }
++
++ void Compute(OpKernelContext* context) override {
++ // Grab the input tensors
++ const Tensor& origin_shape = context->input(0);
++ const Tensor& input = context->input(1);
++ const Tensor& input_rows = context->input(2);
++ const Tensor& reshape_sizes = context->input(3);
++
++ const Tensor& pack = context->input(4);
++
++ VLOG(1) << "Input shape: " << input.shape().DebugString();
++ OP_REQUIRES(context,
++ TensorShapeUtils::IsVector(origin_shape.shape()),
++ errors::InvalidArgument("origin_shape must be 1-D, not ", origin_shape.shape().DebugString())
++ );
++ OP_REQUIRES(context,
++ origin_shape.NumElements() == 2,
++ errors::InvalidArgument("origin_shape NumElements must == 2, not ", origin_shape.NumElements())
++ );
++ OP_REQUIRES(context,
++ TensorShapeUtils::IsMatrix(input.shape()),
++ errors::InvalidArgument("input must be 2-D, not ", input.shape().DebugString()));
++ OP_REQUIRES(context,
++ TensorShapeUtils::IsScalar(input_rows.shape()),
++ errors::InvalidArgument("input_rows must be a scalar")
++ );
++ OP_REQUIRES(context,
++ TensorShapeUtils::IsVector(reshape_sizes.shape()),
++ errors::InvalidArgument("sizes input must be 1-D, not ", reshape_sizes.shape().DebugString())
++ );
++ OP_REQUIRES(context,
++ reshape_sizes.NumElements() == 2,
++ errors::InvalidArgument("reshape_sizes NumElements must == 2"));
++
++ int input_rows_value = input_rows.scalar<int32>()();
++ int padding_rows = static_cast<int32>(origin_shape.flat<int64>()(0)) - input_rows_value;
++ auto reshape_cols = reshape_sizes.flat<int32>()(1);
++ int output_rows = padding_rows + input.dim_size(0);
++ int output_cols = input.dim_size(1);
++ OP_REQUIRES(context,
++ padding_rows >= 0,
++ errors::InvalidArgument("Pooling size(", input_rows_value,
++ ") is greater than Input size(", static_cast<int32>(origin_shape.flat<int64>()(0)), ")"));
++ OP_REQUIRES(context,
++ reshape_cols > 0,
++ errors::InvalidArgument("reshape_cols must > 0"));
++ OP_REQUIRES(context,
++ reshape_sizes.flat<int32>()(0) == -1,
++ errors::InvalidArgument("reshape[0] is not -1"));
++ OP_REQUIRES(context,
++ pack.scalar<int32>()() == output_cols,
++ errors::InvalidArgument("pack(", pack.scalar<int32>()(), ") is not equal to embedding dims"));
++
++ Tensor* output0 = nullptr;
++ Tensor* output1 = nullptr;
++ OP_REQUIRES_OK(context,
++ context->allocate_output(0, TensorShape({}),
++ &output0));
++ output0->scalar<int32>()() = padding_rows;
++ OP_REQUIRES(context,
++ output_rows * output_cols % reshape_cols == 0,
++ errors::InvalidArgument("padding cannot reshape to [-1, ", reshape_cols, "]")
++ );
++ int reshape_rows = output_rows * output_cols / reshape_cols;
++ if (fast_) {
++ OP_REQUIRES_OK(context, context->allocate_output(1, TensorShape({}), &output1));
++ output1->scalar<int32>()() = reshape_rows;
++ return;
++ }
++
++ TensorShape reshaped_shape({reshape_rows, reshape_cols});
++ OP_REQUIRES_OK(context,
++ context->allocate_output(1, reshaped_shape, &output1));
++ float* output_data = output1->flat<float>().data();
++ const float* input_data = input.flat<float>().data();
++ std::memcpy(output_data, input_data, input_rows_value * output_cols * sizeof(float));
++ std::memset(output_data + input_rows_value * output_cols,
++ 0,
++ padding_rows * output_cols * sizeof(float));
++ }
++
++private:
++ bool fast_;
++};
++
++
++REGISTER_KERNEL_BUILDER(Name("KPFusedEmbeddingPadding").Device(DEVICE_CPU),
++ KPFusedEmbeddingPaddingOp);
++
++REGISTER_KERNEL_BUILDER(Name("KPFusedEmbeddingPaddingFast").Device(DEVICE_CPU),
++ KPFusedEmbeddingPaddingOp);
++
++}
+\ No newline at end of file
+diff --git a/annc/tensorflow/kernels/embedding_fused_padding_test.cc b/annc/tensorflow/kernels/embedding_fused_padding_test.cc
+new file mode 100644
+index 0000000..5137d51
+--- /dev/null
++++ b/annc/tensorflow/kernels/embedding_fused_padding_test.cc
+@@ -0,0 +1,307 @@
++/* Copyright 2015 The TensorFlow Authors. All Rights Reserved.
++
++Licensed under the Apache License, Version 2.0 (the "License");
++you may not use this file except in compliance with the License.
++You may obtain a copy of the License at
++
++ http://www.apache.org/licenses/LICENSE-2.0
++
++Unless required by applicable law or agreed to in writing, software
++distributed under the License is distributed on an "AS IS" BASIS,
++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++See the License for the specific language governing permissions and
++limitations under the License.
++==============================================================================*/ ++ ++#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" ++#include "tensorflow/core/framework/allocator.h" ++#include "tensorflow/core/framework/fake_input.h" ++#include "tensorflow/core/framework/node_def_builder.h" ++#include "tensorflow/core/framework/op_kernel.h" ++#include "tensorflow/core/framework/tensor.h" ++#include "tensorflow/core/framework/types.h" ++#include "tensorflow/core/framework/types.pb.h" ++#include "tensorflow/core/graph/testlib.h" ++#include "tensorflow/core/kernels/ops_testutil.h" ++#include "tensorflow/core/kernels/ops_util.h" ++#include "tensorflow/core/lib/core/status_test_util.h" ++#include "tensorflow/core/lib/gtl/array_slice.h" ++#include "tensorflow/core/lib/random/simple_philox.h" ++#include "tensorflow/core/lib/strings/str_util.h" ++#include "tensorflow/core/platform/test.h" ++#include "tensorflow/core/platform/test_benchmark.h" ++ ++namespace tensorflow { ++ ++class KPFusedEmbeddingPaddingTest : public OpsTestBase { ++ protected: ++ void MakeOp(DataType input_shape_type, DataType pooling_type, DataType reshape_type, DataType const_type) { ++ TF_ASSERT_OK(NodeDefBuilder("fused_padding", "KPFusedEmbeddingPadding") ++ .Input(FakeInput(input_shape_type)) ++ .Input(FakeInput(pooling_type)) ++ .Input(FakeInput(const_type)) ++ .Input(FakeInput(reshape_type)) ++ .Input(FakeInput(const_type)) ++ .Finalize(node_def())); ++ TF_ASSERT_OK(InitOp()); ++ } ++ ++ Status FeedAndRun(const int embedding_dims, const int table_size, ++ const int pooling_size, const int reshape_size) { ++ MakeOp(DT_INT64, DT_FLOAT, DT_INT32, DT_INT32); ++ AddInputFromArray<int64>(TensorShape({2}), {table_size, embedding_dims}); ++ AddInput<float>(TensorShape({pooling_size, embedding_dims}), [](int i) -> float { ++ return static_cast<float>(i + 1); ++ }); ++ AddInputFromArray<int32>(TensorShape({}), {pooling_size}); ++ AddInputFromArray<int32>(TensorShape({2}), {-1, reshape_size}); ++ AddInputFromArray<int32>(TensorShape({}), {embedding_dims}); ++ return RunOpKernel(); ++ } ++ ++ void MakeFastOp(DataType input_shape_type, DataType pooling_type, DataType reshape_type, DataType const_type) { ++ TF_ASSERT_OK(NodeDefBuilder("fused_padding_fast", "KPFusedEmbeddingPaddingFast") ++ .Input(FakeInput(input_shape_type)) ++ .Input(FakeInput(pooling_type)) ++ .Input(FakeInput(const_type)) ++ .Input(FakeInput(reshape_type)) ++ .Input(FakeInput(const_type)) ++ .Finalize(node_def())); ++ TF_ASSERT_OK(InitOp()); ++ } ++ ++ Status FeedAndRunFast(const int embedding_dims, const int table_size, ++ const int pooling_size, const int reshape_size) { ++ MakeFastOp(DT_INT64, DT_FLOAT, DT_INT32, DT_INT32); ++ AddInputFromArray<int64>(TensorShape({2}), {table_size, embedding_dims}); ++ AddInput<float>(TensorShape({pooling_size, embedding_dims}), [](int i) -> float { ++ return static_cast<float>(i + 1); ++ }); ++ AddInputFromArray<int32>(TensorShape({}), {pooling_size}); ++ AddInputFromArray<int32>(TensorShape({2}), {-1, reshape_size}); ++ AddInputFromArray<int32>(TensorShape({}), {embedding_dims}); ++ return RunOpKernel(); ++ } ++}; ++ ++TEST_F(KPFusedEmbeddingPaddingTest, FusedPaddingWithEmbeddingDims10_0) { ++ // Feed and run ++ const int embedding_dims = 10; ++ const int table_size = 151; ++ const int pooling_size = 151; ++ const int reshape_size = 1510; ++ TF_ASSERT_OK(FeedAndRun(embedding_dims, table_size, pooling_size, reshape_size)); ++ ++ // Check the output. 
++ Tensor expected1(allocator(), DT_INT32, TensorShape({})); ++ Tensor expected2(allocator(), DT_FLOAT, TensorShape({table_size * embedding_dims / reshape_size, reshape_size})); ++ test::FillValues<int32>(&expected1, {table_size - pooling_size}); ++ test::FillFn<float>(&expected2, [=](int i) -> float { ++ if (i < pooling_size * embedding_dims) { ++ return static_cast<float>(i + 1); ++ } else { ++ return 0.0f; ++ } ++ }); ++ test::ExpectTensorEqual<int32>(expected1, *GetOutput(0)); ++ test::ExpectTensorNear<float>(expected2, *GetOutput(1), 1e-5); ++} ++ ++TEST_F(KPFusedEmbeddingPaddingTest, FusedPaddingWithEmbeddingDims10_1) { ++ // Feed and run ++ const int embedding_dims = 10; ++ const int table_size = 1510; ++ const int pooling_size = 151; ++ const int reshape_size = 1510; ++ TF_ASSERT_OK(FeedAndRun(embedding_dims, table_size, pooling_size, reshape_size)); ++ ++ // Check the output. ++ Tensor expected1(allocator(), DT_INT32, TensorShape({})); ++ Tensor expected2(allocator(), DT_FLOAT, TensorShape({table_size * embedding_dims / reshape_size, reshape_size})); ++ test::FillValues<int32>(&expected1, {table_size - pooling_size}); ++ test::FillFn<float>(&expected2, [=](int i) -> float { ++ if (i < pooling_size * embedding_dims) { ++ return static_cast<float>(i + 1); ++ } else { ++ return 0.0f; ++ } ++ }); ++ test::ExpectTensorEqual<int32>(expected1, *GetOutput(0)); ++ test::ExpectTensorNear<float>(expected2, *GetOutput(1), 1e-5); ++} ++ ++TEST_F(KPFusedEmbeddingPaddingTest, FusedPaddingWithEmbeddingDims12_0) { ++ // Feed and run ++ const int embedding_dims = 12; ++ const int table_size = 2; ++ const int pooling_size = 2; ++ const int reshape_size = 24; ++ TF_ASSERT_OK(FeedAndRun(embedding_dims, table_size, pooling_size, reshape_size)); ++ ++ // Check the output. ++ Tensor expected1(allocator(), DT_INT32, TensorShape({})); ++ Tensor expected2(allocator(), DT_FLOAT, TensorShape({table_size * embedding_dims / reshape_size, reshape_size})); ++ test::FillValues<int32>(&expected1, {table_size - pooling_size}); ++ test::FillFn<float>(&expected2, [=](int i) -> float { ++ if (i < pooling_size * embedding_dims) { ++ return static_cast<float>(i + 1); ++ } else { ++ return 0.0f; ++ } ++ }); ++ test::ExpectTensorEqual<int32>(expected1, *GetOutput(0)); ++ test::ExpectTensorNear<float>(expected2, *GetOutput(1), 1e-5); ++} ++ ++TEST_F(KPFusedEmbeddingPaddingTest, FusedPaddingWithEmbeddingDims12_1) { ++ // Feed and run ++ const int embedding_dims = 12; ++ const int table_size = 200; ++ const int pooling_size = 2; ++ const int reshape_size = 24; ++ TF_ASSERT_OK(FeedAndRun(embedding_dims, table_size, pooling_size, reshape_size)); ++ ++ // Check the output. 
++ Tensor expected1(allocator(), DT_INT32, TensorShape({})); ++ Tensor expected2(allocator(), DT_FLOAT, TensorShape({table_size * embedding_dims / reshape_size, reshape_size})); ++ test::FillValues<int32>(&expected1, {table_size - pooling_size}); ++ test::FillFn<float>(&expected2, [=](int i) -> float { ++ if (i < pooling_size * embedding_dims) { ++ return static_cast<float>(i + 1); ++ } else { ++ return 0.0f; ++ } ++ }); ++ test::ExpectTensorEqual<int32>(expected1, *GetOutput(0)); ++ test::ExpectTensorNear<float>(expected2, *GetOutput(1), 1e-5); ++} ++ ++TEST_F(KPFusedEmbeddingPaddingTest, FusedPaddingFastWithEmbeddingDims10_0) { ++ // Feed and run ++ const int embedding_dims = 10; ++ const int table_size = 151; ++ const int pooling_size = 151; ++ const int reshape_size = 1510; ++ TF_ASSERT_OK(FeedAndRunFast(embedding_dims, table_size, pooling_size, reshape_size)); ++ ++ // Check the output. ++ Tensor expected1(allocator(), DT_INT32, TensorShape({})); ++ Tensor expected2(allocator(), DT_INT32, TensorShape({})); ++ test::FillValues<int32>(&expected1, {table_size - pooling_size}); ++ test::FillValues<int32>(&expected2, {table_size * embedding_dims / reshape_size}); ++ test::ExpectTensorEqual<int32>(expected1, *GetOutput(0)); ++ test::ExpectTensorEqual<int32>(expected2, *GetOutput(1)); ++} ++ ++TEST_F(KPFusedEmbeddingPaddingTest, FusedPaddingFastWithEmbeddingDims10_1) { ++ // Feed and run ++ const int embedding_dims = 10; ++ const int table_size = 1510; ++ const int pooling_size = 151; ++ const int reshape_size = 1510; ++ TF_ASSERT_OK(FeedAndRunFast(embedding_dims, table_size, pooling_size, reshape_size)); ++ ++ // Check the output. ++ Tensor expected1(allocator(), DT_INT32, TensorShape({})); ++ Tensor expected2(allocator(), DT_INT32, TensorShape({})); ++ test::FillValues<int32>(&expected1, {table_size - pooling_size}); ++ test::FillValues<int32>(&expected2, {table_size * embedding_dims / reshape_size}); ++ test::ExpectTensorEqual<int32>(expected1, *GetOutput(0)); ++ test::ExpectTensorEqual<int32>(expected2, *GetOutput(1)); ++} ++ ++TEST_F(KPFusedEmbeddingPaddingTest, FusedPaddingFastWithEmbeddingDims12_0) { ++ // Feed and run ++ const int embedding_dims = 12; ++ const int table_size = 2; ++ const int pooling_size = 2; ++ const int reshape_size = 24; ++ TF_ASSERT_OK(FeedAndRunFast(embedding_dims, table_size, pooling_size, reshape_size)); ++ ++ // Check the output. ++ Tensor expected1(allocator(), DT_INT32, TensorShape({})); ++ Tensor expected2(allocator(), DT_INT32, TensorShape({})); ++ test::FillValues<int32>(&expected1, {table_size - pooling_size}); ++ test::FillValues<int32>(&expected2, {table_size * embedding_dims / reshape_size}); ++ test::ExpectTensorEqual<int32>(expected1, *GetOutput(0)); ++ test::ExpectTensorEqual<int32>(expected2, *GetOutput(1)); ++} ++ ++TEST_F(KPFusedEmbeddingPaddingTest, FusedPaddingFastWithEmbeddingDims12_1) { ++ // Feed and run ++ const int embedding_dims = 12; ++ const int table_size = 200; ++ const int pooling_size = 2; ++ const int reshape_size = 24; ++ TF_ASSERT_OK(FeedAndRunFast(embedding_dims, table_size, pooling_size, reshape_size)); ++ ++ // Check the output. 
++ Tensor expected1(allocator(), DT_INT32, TensorShape({})); ++ Tensor expected2(allocator(), DT_INT32, TensorShape({})); ++ test::FillValues<int32>(&expected1, {table_size - pooling_size}); ++ test::FillValues<int32>(&expected2, {table_size * embedding_dims / reshape_size}); ++ test::ExpectTensorEqual<int32>(expected1, *GetOutput(0)); ++ test::ExpectTensorEqual<int32>(expected2, *GetOutput(1)); ++} ++ ++TEST_F(KPFusedEmbeddingPaddingTest, FusedPaddingWithUnexpectReshape) { ++ // Feed and run ++ const int embedding_dims = 12; ++ const int table_size = 200; ++ const int pooling_size = 2; ++ const int reshape_size = 24; ++ MakeOp(DT_INT64, DT_FLOAT, DT_INT32, DT_INT32); ++ AddInputFromArray<int64>(TensorShape({2}), {table_size, embedding_dims}); ++ AddInput<float>(TensorShape({pooling_size, embedding_dims}), [](int i) -> float { ++ return static_cast<float>(i + 1); ++ }); ++ AddInputFromArray<int32>(TensorShape({}), {pooling_size}); ++ AddInputFromArray<int32>(TensorShape({2}), {10, reshape_size}); ++ AddInputFromArray<int32>(TensorShape({}), {embedding_dims}); ++ Status s = RunOpKernel(); ++ EXPECT_TRUE( ++ absl::StrContains(s.ToString(), "reshape[0] is not -1")) ++ << s; ++} ++ ++TEST_F(KPFusedEmbeddingPaddingTest, FusedPaddingWithUnexpectPack) { ++ // Feed and run ++ const int embedding_dims = 12; ++ const int table_size = 200; ++ const int pooling_size = 2; ++ const int reshape_size = 24; ++ MakeOp(DT_INT64, DT_FLOAT, DT_INT32, DT_INT32); ++ AddInputFromArray<int64>(TensorShape({2}), {table_size, embedding_dims}); ++ AddInput<float>(TensorShape({pooling_size, embedding_dims}), [](int i) -> float { ++ return static_cast<float>(i + 1); ++ }); ++ AddInputFromArray<int32>(TensorShape({}), {pooling_size}); ++ AddInputFromArray<int32>(TensorShape({2}), {-1, reshape_size}); ++ AddInputFromArray<int32>(TensorShape({}), {10}); ++ Status s = RunOpKernel(); ++ EXPECT_TRUE( ++ absl::StrContains(s.ToString(), "pack(10) is not equal to embedding dims")) ++ << s; ++} ++ ++TEST_F(KPFusedEmbeddingPaddingTest, FusedPaddingWithPoolingSizeGreaterInput) { ++ // Feed and run ++ const int embedding_dims = 12; ++ const int table_size = 200; ++ const int pooling_size = 201; ++ const int reshape_size = 24; ++ MakeOp(DT_INT64, DT_FLOAT, DT_INT32, DT_INT32); ++ AddInputFromArray<int64>(TensorShape({2}), {table_size, embedding_dims}); ++ AddInput<float>(TensorShape({pooling_size, embedding_dims}), [](int i) -> float { ++ return static_cast<float>(i + 1); ++ }); ++ AddInputFromArray<int32>(TensorShape({}), {pooling_size}); ++ AddInputFromArray<int32>(TensorShape({2}), {-1, reshape_size}); ++ AddInputFromArray<int32>(TensorShape({}), {embedding_dims}); ++ Status s = RunOpKernel(); ++ EXPECT_TRUE( ++ absl::StrContains(s.ToString(), "Pooling size(201) is greater than Input size(200)")) ++ << s; ++} ++ ++} // end namespace tensorflow +diff --git a/annc/tensorflow/kernels/embedding_fused_sparse_dynamic_stitch.cc b/annc/tensorflow/kernels/embedding_fused_sparse_dynamic_stitch.cc +new file mode 100644 +index 0000000..0ccf599 +--- /dev/null ++++ b/annc/tensorflow/kernels/embedding_fused_sparse_dynamic_stitch.cc +@@ -0,0 +1,87 @@ ++/* Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved. ++ ++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. 
++You may obtain a copy of the License at
++
++ http://www.apache.org/licenses/LICENSE-2.0
++
++Unless required by applicable law or agreed to in writing, software
++distributed under the License is distributed on an "AS IS" BASIS,
++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++See the License for the specific language governing permissions and
++limitations under the License.
++==============================================================================*/
++
++#include <vector>
++
++#include "tensorflow/core/framework/common_shape_fns.h"
++#include "tensorflow/core/framework/op.h"
++#include "tensorflow/core/framework/op_kernel.h"
++#include "tensorflow/core/framework/tensor.h"
++#include "tensorflow/core/util/work_sharder.h"
++
++using namespace tensorflow;
++
++class KPFusedSparseDynamicStitchOp : public OpKernel {
++public:
++ explicit KPFusedSparseDynamicStitchOp(OpKernelConstruction* context)
++ : OpKernel(context) {}
++
++ void Compute(OpKernelContext* context) override {
++ const Tensor& x = context->input(0);
++ auto x_flat = x.flat<int64>();
++ int64_t num_elems = x_flat.size();
++
++ const int num_inputs = context->num_inputs();
++ const int num_partitions = num_inputs - 1;
++ OP_REQUIRES(context, num_partitions > 1, errors::InvalidArgument("num partitions must > 1"));
++ int64_t output_stride = 0;
++ std::vector<const float*> variables(num_partitions);
++ std::vector<int64_t> variable_rows(num_partitions);
++ for (int i = 1; i < num_inputs; ++i) {
++ const Tensor& input_tensor = context->input(i);
++ OP_REQUIRES(context, input_tensor.dims() == 2, errors::InvalidArgument("input dims must == 2"));
++ if (i == 1) {
++ output_stride = input_tensor.dim_size(1);
++ } else {
++ OP_REQUIRES(context, input_tensor.dim_size(1) == output_stride,
++ errors::InvalidArgument("All inputs must have same second dimension"));
++ }
++ variables[i - 1] = context->input(i).flat<float>().data();
++ variable_rows[i - 1] = input_tensor.dim_size(0);
++ }
++
++ OP_REQUIRES(context, output_stride > 0, errors::InvalidArgument("output_stride must > 0"));
++
++ Tensor* output_tensor = nullptr;
++ OP_REQUIRES_OK(context,
++ context->allocate_output(0, TensorShape({num_elems, output_stride}),
++ &output_tensor));
++ float* output = output_tensor->flat<float>().data();
++
++ const size_t copy_size = output_stride * sizeof(float);
++
++ auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
++ const int64 cost_per_unit = 120; // approximate execution time of one loop iteration
++ auto work = [&](int start, int end) {
++ for (int i = start; i < end; ++i) {
++ const int64_t global_id = x_flat(i);
++ const int64_t table_id = global_id % num_partitions;
++ const int64_t row_id = global_id / num_partitions;
++
++ OP_REQUIRES(context, row_id < variable_rows[table_id], errors::InvalidArgument(
++ "row_id out of range."));
++
++ std::memcpy(output + i * output_stride,
++ variables[table_id] + row_id * output_stride, copy_size);
++ }
++ };
++
++ Shard(worker_threads->num_threads, worker_threads->workers, num_elems,
++ cost_per_unit, work);
++ }
++};
++
++REGISTER_KERNEL_BUILDER(Name("KPFusedSparseDynamicStitch").Device(DEVICE_CPU),
++ KPFusedSparseDynamicStitchOp);
+diff --git a/annc/tensorflow/kernels/embedding_fused_sparse_dynamic_stitch_test.cc b/annc/tensorflow/kernels/embedding_fused_sparse_dynamic_stitch_test.cc
+new file mode 100644
+index 0000000..74fdd25
+--- /dev/null
++++ b/annc/tensorflow/kernels/embedding_fused_sparse_dynamic_stitch_test.cc
+@@ -0,0 +1,108 @@
++/*
Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ * ==============================================================================*/ ++ ++#include <functional> ++#include <memory> ++#include <vector> ++ ++#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" ++#include "tensorflow/core/framework/allocator.h" ++#include "tensorflow/core/framework/fake_input.h" ++#include "tensorflow/core/framework/node_def_builder.h" ++#include "tensorflow/core/framework/op_kernel.h" ++#include "tensorflow/core/framework/tensor.h" ++#include "tensorflow/core/framework/types.h" ++#include "tensorflow/core/framework/types.pb.h" ++#include "tensorflow/core/graph/testlib.h" ++#include "tensorflow/core/kernels/ops_testutil.h" ++#include "tensorflow/core/kernels/ops_util.h" ++#include "tensorflow/core/lib/core/status_test_util.h" ++#include "tensorflow/core/lib/gtl/array_slice.h" ++#include "tensorflow/core/lib/random/simple_philox.h" ++#include "tensorflow/core/lib/strings/str_util.h" ++#include "tensorflow/core/platform/test.h" ++#include "tensorflow/core/platform/test_benchmark.h" ++ ++namespace tensorflow { ++namespace { ++ ++class KPFusedSparseDynamicStitchOpTest : public OpsTestBase { ++ protected: ++ void MakeOp(int N) { ++ TF_ASSERT_OK(NodeDefBuilder("kp_fused_sparse_dynamic_stitch", ++ "KPFusedSparseDynamicStitch") ++ .Input(FakeInput(DT_INT64)) ++ .Input(FakeInput(N, DT_FLOAT)) ++ .Finalize(node_def())); ++ TF_ASSERT_OK(InitOp()); ++ } ++}; ++ ++TEST_F(KPFusedSparseDynamicStitchOpTest, TestTwoTables) { ++ MakeOp(2); // num_partitions = 2 ++ ++ AddInputFromArray<int64>(TensorShape({4}), {0, 3, 2, 1}); ++ AddInputFromArray<float>(TensorShape({3, 2}), ++ {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); ++ AddInputFromArray<float>(TensorShape({2, 2}), {7.0f, 8.0f, 9.0f, 10.0f}); ++ TF_ASSERT_OK(RunOpKernel()); ++ ++ Tensor expected(allocator(), DT_FLOAT, TensorShape({4, 2})); ++ test::FillValues<float>(&expected, ++ {1.0f, 2.0f, 9.0f, 10.0f, 3.0f, 4.0f, 7.0f, 8.0f}); ++ test::ExpectTensorEqual<float>(expected, *GetOutput(0)); ++} ++ ++TEST_F(KPFusedSparseDynamicStitchOpTest, TestDifferentStride) { ++ MakeOp(2); ++ ++ AddInputFromArray<int64>(TensorShape({4}), {0, 3, 2, 1}); ++ AddInputFromArray<float>(TensorShape({3, 2}), ++ {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); ++ AddInputFromArray<float>(TensorShape({1, 4}), {7.0f, 8.0f, 9.0f, 10.0f}); ++ ++ Status s = RunOpKernel(); ++ EXPECT_FALSE(s.ok()); ++ EXPECT_TRUE(absl::StrContains(s.message(),"All inputs must have same second dimension")); ++} ++ ++TEST_F(KPFusedSparseDynamicStitchOpTest, TestIndicesOutOfBounds) { ++ MakeOp(2); ++ ++ AddInputFromArray<int64>(TensorShape({4}), {0, 6, 2, 1}); ++ AddInputFromArray<float>(TensorShape({3, 2}), ++ {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); ++ AddInputFromArray<float>(TensorShape({2, 2}), {7.0f, 8.0f, 9.0f, 10.0f}); ++ ++ Status s = RunOpKernel(); ++ EXPECT_FALSE(s.ok()); ++ EXPECT_TRUE(absl::StrContains(s.message(),"row_id 
out of range")); ++} ++ ++TEST_F(KPFusedSparseDynamicStitchOpTest, TestInputDims) { ++ MakeOp(2); ++ ++ AddInputFromArray<int64>(TensorShape({4}), {0, 6, 2, 1}); ++ AddInputFromArray<float>(TensorShape({3, 2, 1}), ++ {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f}); ++ AddInputFromArray<float>(TensorShape({2, 2, 1}), {7.0f, 8.0f, 9.0f, 10.0f}); ++ ++ Status s = RunOpKernel(); ++ EXPECT_FALSE(s.ok()); ++ EXPECT_TRUE(absl::StrContains(s.message(),"input dims must == 2")); ++} ++ ++} // namespace ++} // namespace tensorflow +diff --git a/annc/tensorflow/kernels/embedding_fused_sparse_reshape.cc b/annc/tensorflow/kernels/embedding_fused_sparse_reshape.cc +new file mode 100644 +index 0000000..e4acad2 +--- /dev/null ++++ b/annc/tensorflow/kernels/embedding_fused_sparse_reshape.cc +@@ -0,0 +1,195 @@ ++/* Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved.
++
++Licensed under the Apache License, Version 2.0 (the "License");
++you may not use this file except in compliance with the License.
++You may obtain a copy of the License at
++
++ http://www.apache.org/licenses/LICENSE-2.0
++
++Unless required by applicable law or agreed to in writing, software
++distributed under the License is distributed on an "AS IS" BASIS,
++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++See the License for the specific language governing permissions and
++limitations under the License.
++==============================================================================*/
++
++#include "tensorflow/core/framework/common_shape_fns.h"
++#include "tensorflow/core/framework/op.h"
++#include "tensorflow/core/framework/op_kernel.h"
++#include "tensorflow/core/framework/tensor.h"
++#include "tensorflow/core/util/work_sharder.h"
++#include "tensorflow/core/kernels/reshape_util.h"
++#include "tensorflow/core/framework/register_types.h"
++#include "tensorflow/core/framework/tensor_util.h"
++#include "tensorflow/core/framework/types.h"
++#include "tensorflow/core/lib/gtl/inlined_vector.h"
++
++using namespace tensorflow;
++
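++// Sparse-reshape helper modeled on TensorFlow's reshape_util: validates the
++// target shape (at most one -1), then rewrites every sparse index by
++// linearizing with input strides and de-linearizing with output strides.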
++static void ReshapeKp(OpKernelContext *context, const Tensor &input_indices_in,
++ const Tensor &input_shape_in, const Tensor &target_shape_in,
++ int output_indices_idx, int output_shape_idx) {
++ OP_REQUIRES(context, TensorShapeUtils::IsMatrix(input_indices_in.shape()),
++ errors::InvalidArgument(
++ "Input indices should be a matrix but received shape ",
++ input_indices_in.shape().DebugString()));
++ OP_REQUIRES(context, TensorShapeUtils::IsVector(input_shape_in.shape()),
++ errors::InvalidArgument(
++ "Input shape should be a vector but received shape ",
++ input_shape_in.shape().DebugString()));
++ OP_REQUIRES(context, TensorShapeUtils::IsVector(target_shape_in.shape()),
++ errors::InvalidArgument(
++ "Target shape should be a vector but received shape ",
++ target_shape_in.shape().DebugString()));
++
++ const int64 input_rank = input_shape_in.NumElements();
++ const int64 output_rank = target_shape_in.NumElements();
++ const TensorShape input_shape(input_shape_in.vec<int64>());
++ const int64 dense_size = input_shape.num_elements();
++ const int64 nnz = input_indices_in.shape().dim_size(0);
++
++ TensorShape output_shape;
++ int64 product = 1;
++ int unknown_index = -1;
++ auto target_shape = target_shape_in.vec<int64>();
++ for (int d = 0; d < output_rank; ++d) {
++ const int64 size = target_shape(d);
++ if (size == -1) {
++ OP_REQUIRES(
++ context, unknown_index == -1,
++ errors::InvalidArgument("only one output dimension may be -1, "
++ "not both ",
++ unknown_index, " and ", d));
++ unknown_index = d;
++ output_shape.AddDim(1);
++ } else {
++ OP_REQUIRES(context, size >= 0,
++ errors::InvalidArgument("size ", d,
++ " must be non-negative, not ", size));
++ product *= size;
++ output_shape.AddDim(size);
++ }
++ }
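++ // Infer the single -1 dimension so the total element count is preserved.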
++ if (unknown_index != -1) {
++ OP_REQUIRES(
++ context, product > 0,
++ errors::InvalidArgument("reshape cannot infer the missing "
++ "input size for an empty tensor unless all "
++ "specified input sizes are non-zero"));
++ const int64 missing = dense_size / product;
++ OP_REQUIRES(
++ context, product * missing == dense_size,
++ errors::InvalidArgument(
++ "Input to reshape is a SparseTensor with ", dense_size,
++ " dense values, but the requested shape requires a multiple of ",
++ product, ". input_shape=", input_shape.DebugString(),
++ " output_shape=", output_shape.DebugString()));
++ output_shape.set_dim(unknown_index, missing);
++ }
++
++ OP_REQUIRES(
++ context, output_shape.num_elements() == dense_size,
++ errors::InvalidArgument("Input to reshape is a tensor with ", dense_size,
++ " dense values, but the requested shape has ",
++ output_shape.num_elements(),
++ ". input_shape=", input_shape.DebugString(),
++ " output_shape=", output_shape.DebugString()));
++
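++ // Fast path: identical shapes need no index rewriting; forward the inputs.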
++ if (input_shape == output_shape) {
++ context->set_output(output_indices_idx, input_indices_in);
++ context->set_output(output_shape_idx, input_shape_in);
++ return;
++ }
++
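++ // Row-major strides for the input and output shapes drive the rewrite below.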
++ gtl::InlinedVector<int64, 8> input_strides(input_rank);
++ if (input_rank > 0) {
++ input_strides[input_rank - 1] = 1;
++ for (int d = input_rank - 2; d >= 0; --d) {
++ input_strides[d] = input_strides[d + 1] * input_shape.dim_size(d + 1);
++ }
++ }
++
++ gtl::InlinedVector<int64, 8> output_strides(output_rank);
++ if (output_rank > 0) {
++ output_strides[output_rank - 1] = 1;
++ for (int d = output_rank - 2; d >= 0; --d) {
++ output_strides[d] = output_strides[d + 1] * output_shape.dim_size(d + 1);
++ }
++ }
++
++ Tensor *result_indices = nullptr;
++ OP_REQUIRES_OK(context,
++ context->allocate_output(output_indices_idx,
++ TensorShape({nnz, output_rank}),
++ &result_indices));
++ auto input_ind = input_indices_in.matrix<int64>();
++ auto output_ind = result_indices->matrix<int64>();
++ for (int i = 0; i < nnz; ++i) {
++ int64 id = 0;
++ for (int j = 0; j < input_rank; ++j) {
++ id += input_ind(i, j) * input_strides[j];
++ }
++ for (int j = 0; j < output_rank; ++j) {
++ output_ind(i, j) = id / output_strides[j];
++ id %= output_strides[j];
++ }
++ }
++
++ Tensor *result_shape = nullptr;
++ OP_REQUIRES_OK(context, context->allocate_output(output_shape_idx,
++ TensorShape({output_rank}),
++ &result_shape));
++ auto output_shape_vec = result_shape->vec<int64>();
++ for (int j = 0; j < output_shape.dims(); ++j) {
++ output_shape_vec(j) = output_shape.dim_size(j);
++ }
++}
++
++class KPFusedSparseReshapeOp : public OpKernel {
++ public:
++ explicit KPFusedSparseReshapeOp(OpKernelConstruction* context) : OpKernel(context) { }
++
++ void Compute(OpKernelContext* context) override {
++ const Tensor& slice_input = context->input(0);
++ const Tensor& begin = context->input(1);
++ const Tensor& new_shape = context->input(2);
++ const Tensor& pack_const = context->input(3);
++
++ OP_REQUIRES(context, slice_input.dims() == 2, errors::Internal("slice_input dims must == 2"));
++ OP_REQUIRES(context, new_shape.dim_size(0) == 2, errors::Internal("new_shape dim size must == 2"));
++ OP_REQUIRES(context, pack_const.dims() == 0,
++ errors::InvalidArgument("pack_const must be a scalar"));
++ VLOG(1) << "Input slice_input shape: " << slice_input.shape().DebugString();
++ VLOG(1) << "Input begin value: " << begin.DebugString();
++ VLOG(1) << "Input new_shape value: " << new_shape.DebugString();
++
++ OP_REQUIRES(context, begin.dims() == 1 && begin.dim_size(0) == 2,
++ errors::InvalidArgument("begin must be 1D with at least 2 elements"));
++ int32 col = begin.flat<int32>().data()[1];
++ OP_REQUIRES(context, col >= 0 && col < slice_input.dim_size(1), errors::Internal("begin[1] must < slice_input.dim_size(1)"));
++ int64_t num_rows = slice_input.dim_size(0);
++ auto slice_input_mat = slice_input.matrix<int64>();
++
++ VLOG(1) << "num_rows: " << num_rows;
++ VLOG(1) << "slice_input.dim_size(0): " << slice_input.dim_size(0);
++ VLOG(1) << "slice_input.dim_size(1): " << slice_input.dim_size(1);
++ VLOG(1) << "Column index from begin: " << col;
++
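++ // Build the implicit SparseTensor: dense shape [num_rows, pack_const], one
++ // nonzero per row whose column is taken from the sliced input column.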
++ Tensor shape_in(DT_INT64, TensorShape({2}));
++ auto tensor_flat = shape_in.flat<int64>();
++ tensor_flat(0) = num_rows;
++ tensor_flat(1) = pack_const.scalar<int64>()();
++
++ Tensor indices_in(DT_INT64, TensorShape({num_rows, 2}));
++ auto indices_in_mat = indices_in.matrix<int64>();
++ for (int i = 0; i < num_rows; ++i) {
++ indices_in_mat(i, 0) = i;
++ indices_in_mat(i, 1) = slice_input_mat(i, col);
++ }
++
++ ReshapeKp(context, indices_in, shape_in, new_shape, 0, 1);
++ }
++};
++
++REGISTER_KERNEL_BUILDER(Name("KPFusedSparseReshape").Device(DEVICE_CPU),
++ KPFusedSparseReshapeOp);
+\ No newline at end of file
+diff --git a/annc/tensorflow/kernels/embedding_fused_sparse_reshape_test.cc b/annc/tensorflow/kernels/embedding_fused_sparse_reshape_test.cc
+new file mode 100644
+index 0000000..874d48a
+--- /dev/null
++++ b/annc/tensorflow/kernels/embedding_fused_sparse_reshape_test.cc
+@@ -0,0 +1,281 @@
++/* Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved.
++
++Licensed under the Apache License, Version 2.0 (the "License");
++you may not use this file except in compliance with the License.
++You may obtain a copy of the License at
++
++ http://www.apache.org/licenses/LICENSE-2.0
++
++Unless required by applicable law or agreed to in writing, software
++distributed under the License is distributed on an "AS IS" BASIS,
++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++See the License for the specific language governing permissions and
++limitations under the License.
++==============================================================================*/
++
++#include "tensorflow/core/framework/fake_input.h"
++#include "tensorflow/core/framework/node_def_builder.h"
++#include "tensorflow/core/framework/tensor.h"
++#include "tensorflow/core/framework/tensor_testutil.h"
++#include "tensorflow/core/kernels/ops_testutil.h"
++#include "tensorflow/core/lib/core/status_test_util.h"
++
++namespace {
++using tensorflow::AllocatorAttributes;
++using tensorflow::DT_FLOAT;
++using tensorflow::DT_INT32;
++using tensorflow::DT_INT64;
++using tensorflow::int64;
++using tensorflow::int32;
++using tensorflow::NodeDefBuilder;
++using tensorflow::OpsTestBase;
++using tensorflow::Status;
++using tensorflow::Tensor;
++using tensorflow::TensorShape;
++using tensorflow::test::FillValues;
++using tensorflow::test::ExpectTensorEqual;
++
++class KPFusedSparseReshapeTest : public OpsTestBase {
++ protected:
++ void RunValidCase(const TensorShape& slice_shape,
++ const std::vector<int64>& slice_data,
++ const std::vector<int32>& begin_val,
++ const std::vector<int64>& new_shape_val,
++ const std::vector<int64>& pack_const_val,
++ const TensorShape& expected_indices_shape,
++ const std::vector<int64>& expected_shape_val) {
++ TF_EXPECT_OK(NodeDefBuilder("kp_fused_sparse_reshape", "KPFusedSparseReshape")
++ .Input(FakeInput(DT_INT64)) // slice_input
++ .Input(FakeInput(DT_INT32)) // begin
++ .Input(FakeInput(DT_INT64)) // new_shape
++ .Input(FakeInput(DT_INT64)) // pack_const
++ .Finalize(node_def()));
++ TF_EXPECT_OK(InitOp());
++
++ AddInputFromArray<int64>(slice_shape, slice_data);
++ AddInputFromArray<int32>(TensorShape({2}), begin_val);
++ AddInputFromArray<int64>(TensorShape({2}), new_shape_val);
++ AddInputFromArray<int64>(TensorShape({}), pack_const_val);
++
++ TF_ASSERT_OK(RunOpKernel());
++
++ // Output 0: result_indices
++ const Tensor& out_indices = *GetOutput(0);
++ EXPECT_EQ(out_indices.shape(), expected_indices_shape);
++
++ // Output 1: result_shape
++ const Tensor& out_shape = *GetOutput(1);
++ Tensor expected_shape_tensor(DT_INT64,
++ TensorShape({static_cast<int64>(expected_shape_val.size())}));
++ FillValues<int64>(&expected_shape_tensor, expected_shape_val);
++ ExpectTensorEqual<int64>(expected_shape_tensor, out_shape);
++ }
++
++ Status RunOpExpectFailure(const TensorShape& slice_shape,
++ const std::vector<int64>& slice_data,
++ const std::vector<int32>& begin_val,
++ const std::vector<int64>& new_shape_val,
++ const std::vector<int64>& pack_const_val) {
"KPFusedSparseReshape") ++ .Input(FakeInput(DT_INT64)) // slice_input ++ .Input(FakeInput(DT_INT32)) // begin ++ .Input(FakeInput(DT_INT64)) // new_shape ++ .Input(FakeInput(DT_INT64)) // pack_const ++ .Finalize(node_def())); ++ TF_CHECK_OK(InitOp()); ++ ++ AddInputFromArray<int64>(slice_shape, slice_data); ++ AddInputFromArray<int32>(TensorShape({static_cast<int64>(begin_val.size())}), begin_val); ++ AddInputFromArray<int64>(TensorShape({static_cast<int64>(new_shape_val.size())}), new_shape_val); ++ AddInputFromArray<int64>(TensorShape({}), pack_const_val); ++ ++ return RunOpKernel(); ++ } ++}; ++ ++// ==================== 正向测试 ==================== ++ ++// 正常 reshape 案例 ++// pack_const=2 ++TEST_F(KPFusedSparseReshapeTest, Valid_NormalInput) { ++ RunValidCase( ++ TensorShape({4, 2}), // slice_input shape ++ {0, 1, ++ 1, 2, ++ 2, 3, ++ 3, 0}, // slice_input 数据 ++ {0, 1}, // begin = (0,1),选第1列 ++ {2, 4}, // new_shape = [2,4] ++ {2}, // pack_const = [2] ++ TensorShape({4, 2}), // 预期 indices 形状 ++ {2, 4}); // 预期 shape ++} ++ ++// pack_const = 1 ++TEST_F(KPFusedSparseReshapeTest, Valid_PackConst1) { ++ RunValidCase( ++ TensorShape({1, 2}), // slice_input shape ++ {0, 1}, // slice_input 数据 ++ {0, 1}, // begin = (0,1),选第1列 ++ {-1, 1}, // new_shape = [-1,1] ++ {1}, // pack_const = [1] ++ TensorShape({1, 2}), // 预期 indices 形状 ++ {1, 1}); // 预期 shape ++} ++ ++// ==================== 反向测试 ==================== ++ ++// 反例1:slice_input 不是二维 ++TEST_F(KPFusedSparseReshapeTest, Invalid_SliceInputNot2D) { ++ Status s = RunOpExpectFailure( ++ TensorShape({4}), {0, 1, 2, 3}, ++ {0, 0}, ++ {2, 2}, ++ {4}); ++ EXPECT_FALSE(s.ok()); ++ EXPECT_TRUE(absl::StrContains(s.message(), "slice_input dims must == 2")); ++} ++ ++// 反例2:new_shape dim size 不是 2 ++TEST_F(KPFusedSparseReshapeTest, Invalid_NewShapeNotLen2) { ++ Status s = RunOpExpectFailure( ++ TensorShape({2, 2}), {0, 1, 1, 0}, ++ {0, 0}, ++ {4, 2, 1}, // new_shape 多了1个元素 ++ {2}); ++ EXPECT_FALSE(s.ok()); ++ EXPECT_TRUE(absl::StrContains(s.message(), "new_shape dim size must == 2")); ++} ++ ++// 反例3:begin[1] 超出 slice_input 列数 ++TEST_F(KPFusedSparseReshapeTest, Invalid_BeginOutOfRange) { ++ Status s = RunOpExpectFailure( ++ TensorShape({2, 2}), {0, 1, 1, 0}, ++ {0, 2}, // 超过列数 ++ {2, 2}, ++ {2}); ++ EXPECT_FALSE(s.ok()); ++ EXPECT_TRUE(absl::StrContains(s.message(), "begin[1] must < slice_input.dim_size(1)")); ++} ++ ++// 反例4:target shape 有多个 -1 ++TEST_F(KPFusedSparseReshapeTest, Invalid_MultipleUnknownDims) { ++ Status s = RunOpExpectFailure( ++ TensorShape({2, 2}), {0, 1, 1, 0}, ++ {0, 1}, ++ {-1, -1}, // 两个 -1 ++ {2}); ++ EXPECT_FALSE(s.ok()); ++ EXPECT_TRUE(absl::StrContains(s.message(), "only one output dimension may be -1")); ++} ++ ++// 反例5:reshape 推断维度时,总元素数不能整除,导致无法匹配 --> product * missing != dense_size ++TEST_F(KPFusedSparseReshapeTest, Invalid_InferredShapeDoesNotMatch) { ++ TensorShape input_indices_shape({6, 2}); // 6 个非零元素,rank=2 ++ std::vector<int64> input_indices_data = { ++ 0, 0, ++ 0, 1, ++ 0, 2, ++ 1, 0, ++ 1, 1, ++ 1, 2 ++ }; // 对应 2x3 的 dense tensor ++ ++ std::vector<int32> begin_val = {0, 0}; // 假设的 begin 输入 ++ std::vector<int64> new_shape_val = {-1, 4}; // reshape 到 ?x4 ++ std::vector<int64> pack_const_val = {1}; ++ ++ Status s = RunOpExpectFailure( ++ input_indices_shape, ++ input_indices_data, ++ begin_val, ++ new_shape_val, ++ pack_const_val); ++ ++ EXPECT_FALSE(s.ok()); ++ EXPECT_TRUE(absl::StrContains(s.message(), "Input to reshape is a SparseTensor with")); ++} ++ ++// 反例6:reshape 后元素数量不匹配 --> output_shape.num_elements() != 
++// Counterexample 6: element count mismatch after reshape --> output_shape.num_elements() != dense_size
++TEST_F(KPFusedSparseReshapeTest, Invalid_SizeMismatch) {
++ Status s = RunOpExpectFailure(
++ TensorShape({2, 2}), {0, 1, 1, 0},
++ {0, 1},
++ {3, 3}, // expects 9 elements, but input dense size = 4
++ {2});
++ EXPECT_FALSE(s.ok());
++ EXPECT_TRUE(absl::StrContains(s.message(), "Input to reshape is a tensor with"));
++}
++
++// Counterexample 7: target_shape contains a negative value other than -1
++TEST_F(KPFusedSparseReshapeTest, Invalid_NegativeDimNotMinusOne) {
++ Status s = RunOpExpectFailure(
++ TensorShape({2, 2}), {0, 1, 1, 0},
++ {0, 0},
++ {2, -2}, // -2 is invalid
++ {2});
++ EXPECT_FALSE(s.ok());
++ EXPECT_TRUE(absl::StrContains(s.message(), "size 1 must be non-negative, not -2"))
++ << "Actual error: " << s.message();
++}
++
++// Counterexample 8: target_shape has a -1, but the product of the other dimensions is 0
++TEST_F(KPFusedSparseReshapeTest, Invalid_ProductZeroWithUnknownDim) {
++ // dense_size = 0 (empty SparseTensor), target_shape = [-1, 0]
++ // product = 0 -> inference is not allowed
++ Status s = RunOpExpectFailure(
++ TensorShape({0, 2}), {}, // empty slice_input
++ {0, 0},
++ {-1, 0}, // product = 0
++ {2});
++ EXPECT_FALSE(s.ok());
++ EXPECT_TRUE(absl::StrContains(s.message(), "reshape cannot infer the missing input size for an empty tensor"))
++ << "Actual error: " << s.message();
++}
++
++// Counterexample 9: begin is 1-D but has only 1 element (fewer than the required 2)
++TEST_F(KPFusedSparseReshapeTest, Invalid_BeginRank1ButSize1) {
++ Status s = RunOpExpectFailure(
++ TensorShape({2, 2}), {0, 1, 1, 0},
++ {0}, // begin = [0], length 1
++ {2, 2},
++ {2});
++ EXPECT_FALSE(s.ok());
++ EXPECT_TRUE(absl::StrContains(s.message(), "begin must be 1D with exactly 2 elements"))
++ << "Actual error: " << s.message();
++}
++
++// Counterexample 10: begin is 1-D but has 3 elements (more than 2)
++TEST_F(KPFusedSparseReshapeTest, Invalid_BeginRank1ButSize3) {
++ Status s = RunOpExpectFailure(
++ TensorShape({2, 2}), {0, 1, 1, 0},
++ {0, 1, 2}, // begin = [0,1,2], length 3
++ {2, 2},
++ {2});
++ EXPECT_FALSE(s.ok());
++ EXPECT_TRUE(absl::StrContains(s.message(), "begin must be 1D with exactly 2 elements"))
++ << "Actual error: " << s.message();
++}
++
++// Counterexample 11: pack_const is 1-D, but the kernel requires a 0-D scalar
++TEST_F(KPFusedSparseReshapeTest, Invalid_PackConstIsScalarButExpect1D) {
++ TF_CHECK_OK(NodeDefBuilder("kp_fused_sparse_reshape", "KPFusedSparseReshape")
++ .Input(FakeInput(DT_INT64)) // slice_input
++ .Input(FakeInput(DT_INT32)) // begin
++ .Input(FakeInput(DT_INT64)) // new_shape
++ .Input(FakeInput(DT_INT64)) // pack_const
++ .Finalize(node_def()));
++ TF_CHECK_OK(InitOp());
++
++ AddInputFromArray<int64>(TensorShape({2, 2}), {0, 1, 1, 0});
++ AddInputFromArray<int32>(TensorShape({2}), {0, 1});
++ AddInputFromArray<int64>(TensorShape({2}), {2, 2});
++ AddInputFromArray<int64>(TensorShape({1}), {1}); // pack_const as a 1-D tensor [1], not a scalar
++
++ Status s = RunOpKernel();
++ EXPECT_FALSE(s.ok());
++ EXPECT_TRUE(absl::StrContains(s.message(), "pack_const must be a scalar"))
++ << "Actual error: " << s.message();
++}
++
++} // namespace
+diff --git a/annc/tensorflow/kernels/embedding_fused_sparse_segment_reduce.cc b/annc/tensorflow/kernels/embedding_fused_sparse_segment_reduce.cc
+new file mode 100644
+index 0000000..33bbd31
+--- /dev/null
++++ b/annc/tensorflow/kernels/embedding_fused_sparse_segment_reduce.cc
+@@ -0,0 +1,165 @@
++/* Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved.
++
++Licensed under the Apache License, Version 2.0 (the "License");
++you may not use this file except in compliance with the License.
+diff --git a/annc/tensorflow/kernels/embedding_fused_sparse_segment_reduce.cc b/annc/tensorflow/kernels/embedding_fused_sparse_segment_reduce.cc
+new file mode 100644
+index 0000000..33bbd31
+--- /dev/null
++++ b/annc/tensorflow/kernels/embedding_fused_sparse_segment_reduce.cc
+@@ -0,0 +1,165 @@
++/* Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved.
++
++Licensed under the Apache License, Version 2.0 (the "License");
++you may not use this file except in compliance with the License.
++You may obtain a copy of the License at
++
++    http://www.apache.org/licenses/LICENSE-2.0
++
++Unless required by applicable law or agreed to in writing, software
++distributed under the License is distributed on an "AS IS" BASIS,
++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++See the License for the specific language governing permissions and
++limitations under the License.
++==============================================================================*/
++
++#include <arm_neon.h>
++
++#include "tensorflow/core/framework/common_shape_fns.h"
++#include "tensorflow/core/framework/op.h"
++#include "tensorflow/core/framework/op_kernel.h"
++#include "tensorflow/core/framework/tensor.h"
++#include "tensorflow/core/util/work_sharder.h"
++
++using namespace tensorflow;
++
++template <typename Tidx>
++class KPFusedSparseSegmentReduceOp : public OpKernel {
++public:
++  explicit KPFusedSparseSegmentReduceOp(OpKernelConstruction* context)
++      : OpKernel(context) {
++    int combiner_mode;
++    OP_REQUIRES_OK(context, context->GetAttr("combiner", &combiner_mode));
++    OP_REQUIRES(context, combiner_mode == 0 || combiner_mode == 1,
++                errors::InvalidArgument("combiner must be 0 or 1"));
++    is_mean_ = (combiner_mode == 1);
++  }
++
++  void Compute(OpKernelContext* context) override {
++    const Tensor& input_tensor = context->input(0);
++    const Tensor& indices = context->input(1);
++    const Tensor& slice_input = context->input(2);
++    const Tensor& begin = context->input(3);
++    const Tensor& begin_1 = context->input(4);
++
++    OP_REQUIRES(context, input_tensor.dims() == 2, errors::InvalidArgument("input must be 2-D"));
++    OP_REQUIRES(context, slice_input.dims() == 2, errors::InvalidArgument("slice input must be 2-D"));
++    OP_REQUIRES(context, begin.NumElements() == 2, errors::InvalidArgument("begin must have 2 elements"));
++    OP_REQUIRES(context, begin_1.NumElements() == 1, errors::InvalidArgument("begin_1 must have 1 element"));
++    int64_t num_indices = indices.dim_size(0);
++    int64_t embedding_size = input_tensor.dim_size(1);
++    int32 col = begin.flat<int32>().data()[1];
++    int32 out_dim = static_cast<int32>(begin_1.flat<int32>()(0));
++
++    OP_REQUIRES(context, col >= 0 && col < slice_input.dim_size(1),
++                errors::InvalidArgument("Column index out of range"));
++    OP_REQUIRES(context, num_indices == slice_input.dim_size(0),
++                errors::InvalidArgument("indices and slice_input.dim_size(0) should have same size"));
++
++    auto input_data = input_tensor.matrix<float>().data();
++    auto indices_vec = indices.vec<Tidx>();
++    auto slice_input_mat = slice_input.matrix<int64>();
++
++    // Calculate max segment_id
++    int64 max_seg_id = 0;
++    for (int64 i = 0; i < num_indices; ++i) {
++      int64 seg_id = slice_input_mat(i, col);
++      if (seg_id > max_seg_id) {
++        max_seg_id = seg_id;
++      }
++    }
++    const int64 batch_size = max_seg_id + 1;
++
++    Tensor* output = nullptr;
++    OP_REQUIRES_OK(context,
++                   context->allocate_output(
++                       0, TensorShape({batch_size, embedding_size}), &output));
++    output->flat<float>().setZero();
++    Tensor* slice_out = nullptr;
++    OP_REQUIRES_OK(context,
++                   context->allocate_output(1, TensorShape({}), &slice_out));
++    if (out_dim == 0)
++      slice_out->scalar<int32>()() = batch_size;
++    else
++      slice_out->scalar<int32>()() = embedding_size;
++
++    auto output_data = output->matrix<float>().data();
++
++    if (is_mean_) {
++      Tensor counts(DT_INT32, TensorShape({batch_size}));
++      counts.flat<int32>().setZero();
++      auto counts_vec = counts.flat<int32>();
++
++      for (int64 i = 0; i < num_indices; ++i) {
++        const int64 seg_id = slice_input_mat(i, col);
++        const Tidx data_row = indices_vec(i);
++        counts_vec(seg_id) += 1;
++
++        float* output_row = output_data + seg_id * embedding_size;
++        const float* input_data_row = input_data + data_row * embedding_size;
++        int64 j = 0;
++        for (; j + 3 < embedding_size; j += 4) {
++          float32x4_t out = vld1q_f32(output_row + j);
++          float32x4_t data = vld1q_f32(input_data_row + j);
++          out = vaddq_f32(out, data);
++          vst1q_f32(output_row + j, out);
++        }
++
++        for (; j < embedding_size; ++j) {
++          output_row[j] += input_data_row[j];
++        }
++      }
++
++      for (int64_t seg = 0; seg < batch_size; ++seg) {
++        const int32_t count = counts_vec(seg);
++        if (count > 0) {
++          const float inv_count = 1.0f / static_cast<float>(count);
++          const float32x4_t inv_count_vec = vdupq_n_f32(inv_count);
++
++          float* row_start = output_data + seg * embedding_size;
++          int64_t j = 0;
++
++          for (; j + 3 < embedding_size; j += 4) {
++            float32x4_t val = vld1q_f32(row_start + j);
++            val = vmulq_f32(val, inv_count_vec);
++            vst1q_f32(row_start + j, val);
++          }
++
++          for (; j < embedding_size; ++j) {
++            row_start[j] *= inv_count;
++          }
++        }
++      }
++    } else {
++      for (int64 i = 0; i < num_indices; ++i) {
++        const int64 seg_id = slice_input_mat(i, col);
++        const Tidx data_row = indices_vec(i);
++
++        float* output_row = output_data + seg_id * embedding_size;
++        const float* input_data_row = input_data + data_row * embedding_size;
++        int64 j = 0;
++        for (; j + 3 < embedding_size; j += 4) {
++          float32x4_t out = vld1q_f32(output_row + j);
++          float32x4_t data = vld1q_f32(input_data_row + j);
++          out = vaddq_f32(out, data);
++          vst1q_f32(output_row + j, out);
++        }
++
++        for (; j < embedding_size; ++j) {
++          output_row[j] += input_data_row[j];
++        }
++      }
++    }
++  }
++
++private:
++  bool is_mean_;
++};
++
++#define REGISTER_KERNEL(Tidx)                                  \
++  REGISTER_KERNEL_BUILDER(Name("KPFusedSparseSegmentReduce")   \
++                              .Device(DEVICE_CPU)              \
++                              .TypeConstraint<Tidx>("Tidx"),   \
++                          KPFusedSparseSegmentReduceOp<Tidx>);
++REGISTER_KERNEL(int64)
++REGISTER_KERNEL(int32)
++#undef REGISTER_KERNEL
+\ No newline at end of file
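Both the accumulate loops and the MEAN normalization above use the same 4-lane NEON idiom: a vectorized main loop followed by a scalar tail for embedding sizes that are not multiples of four. Pulled out of the kernel, the accumulate step looks roughly like this (an illustrative standalone sketch; the AddRowInto name is not in the patch, and it compiles only where <arm_neon.h> is available, which the kernels here assume by including it unconditionally).

#include <arm_neon.h>
#include <cstdint>

// dst[0..n) += src[0..n): four floats per NEON iteration, scalar tail after.
void AddRowInto(float* dst, const float* src, int64_t n) {
  int64_t j = 0;
  for (; j + 3 < n; j += 4) {
    vst1q_f32(dst + j, vaddq_f32(vld1q_f32(dst + j), vld1q_f32(src + j)));
  }
  for (; j < n; ++j) {
    dst[j] += src[j];
  }
}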
+diff --git a/annc/tensorflow/kernels/embedding_fused_sparse_segment_reduce_nonzero.cc b/annc/tensorflow/kernels/embedding_fused_sparse_segment_reduce_nonzero.cc
+new file mode 100644
+index 0000000..5266476
+--- /dev/null
++++ b/annc/tensorflow/kernels/embedding_fused_sparse_segment_reduce_nonzero.cc
+@@ -0,0 +1,159 @@
++/* Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved.
++
++Licensed under the Apache License, Version 2.0 (the "License");
++you may not use this file except in compliance with the License.
++You may obtain a copy of the License at
++
++    http://www.apache.org/licenses/LICENSE-2.0
++
++Unless required by applicable law or agreed to in writing, software
++distributed under the License is distributed on an "AS IS" BASIS,
++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++See the License for the specific language governing permissions and
++limitations under the License.
++==============================================================================*/
++
++#include <arm_neon.h>
++
++#include "tensorflow/core/framework/common_shape_fns.h"
++#include "tensorflow/core/framework/op.h"
++#include "tensorflow/core/framework/op_kernel.h"
++#include "tensorflow/core/framework/tensor.h"
++#include "tensorflow/core/util/work_sharder.h"
++#include "absl/container/flat_hash_map.h"
++
++using namespace tensorflow;
++
++template <typename Tidx>
++class KPFusedSparseSegmentReduceNonzeroOp : public OpKernel {
++public:
++  explicit KPFusedSparseSegmentReduceNonzeroOp(OpKernelConstruction* context)
++      : OpKernel(context) {
++    int combiner_mode;
++    OP_REQUIRES_OK(context, context->GetAttr("combiner", &combiner_mode));
++    OP_REQUIRES(context, combiner_mode == 0 || combiner_mode == 1,
++                errors::InvalidArgument("combiner must be 0 or 1"));
++    is_mean_ = (combiner_mode == 1);
++  }
++
++  void Compute(OpKernelContext* context) override {
++    const Tensor& input_tensor = context->input(0);
++    const Tensor& indices = context->input(1);
++    const Tensor& slice_input = context->input(2);
++    const Tensor& begin = context->input(3);
++
++    OP_REQUIRES(context, input_tensor.dims() == 1,
++                errors::InvalidArgument("Input data must be a vector"));
++    OP_REQUIRES(context, slice_input.dims() == 2, errors::InvalidArgument("slice input must be 2-D"));
++    OP_REQUIRES(context, begin.NumElements() == 2, errors::InvalidArgument("begin must have 2 elements"));
++
++    int64 num_indices = indices.dim_size(0);
++    int32 col = begin.flat<int32>().data()[1];
++
++    OP_REQUIRES(context, col >= 0 && col < slice_input.dim_size(1),
++                errors::InvalidArgument("Column index out of range"));
++    OP_REQUIRES(context, num_indices == slice_input.dim_size(0),
++                errors::InvalidArgument("indices and slice_input.dim_size(0) should have same size"));
++
++    auto input_data = input_tensor.flat<float>();
++    auto indices_vec = indices.vec<Tidx>();
++    auto slice_input_mat = slice_input.matrix<int64>();
++
++    // Calculate max segment_id
++    std::vector<int64> segment_ids(num_indices);
++    int64 max_seg_id = 0;
++    for (int64 i = 0; i < num_indices; ++i) {
++      int64 seg_id = slice_input_mat(i, col);
++      segment_ids[i] = seg_id;
++      if (seg_id > max_seg_id) {
++        max_seg_id = seg_id;
++      }
++    }
++
++    const int64 batch_size = max_seg_id + 1;
++
++    Tensor* output_shape = nullptr;
++    OP_REQUIRES_OK(
++        context, context->allocate_output(0, TensorShape({1}), &output_shape));
++    output_shape->flat<int32>()(0) = static_cast<int32>(batch_size);
++
++    std::vector<std::pair<int64, float>> results;
++    int64 num_nonzero = 0;
++    absl::flat_hash_map<int64, float> segment_sums;
++    absl::flat_hash_map<int64, int32> segment_counts;
++    std::vector<int64> segment_order;
++
++    if (is_mean_) {
++      for (int64 i = 0; i < num_indices; ++i) {
++        const int64 seg_id = segment_ids[i];
++        const Tidx data_row = indices_vec(i);
++
++        if (segment_sums.find(seg_id) == segment_sums.end()) {
++          segment_order.push_back(seg_id);
++        }
++
++        segment_sums[seg_id] += input_data(data_row);
++        segment_counts[seg_id] += 1;
++      }
++
++      for (int64 seg_id : segment_order) {
++        const int32_t count = segment_counts[seg_id];
++        if (count > 0) {
++          const float inv_count = 1.0f / static_cast<float>(count);
++          float value = segment_sums[seg_id];
++          if (value != 0) {
++            results.push_back({seg_id, value * inv_count});
++            num_nonzero++;
++          }
++        }
++      }
++    } else {
++      for (int64 i = 0; i < num_indices; ++i) {
++        const int64 seg_id = segment_ids[i];
++        const Tidx data_row = indices_vec(i);
++
++        if
(segment_sums.find(seg_id) == segment_sums.end()) { ++ segment_order.push_back(seg_id); ++ } ++ ++ segment_sums[seg_id] += input_data(data_row); ++ } ++ ++ for (int64 seg_id : segment_order) { ++ float value = segment_sums[seg_id]; ++ if (value != 0) { ++ results.push_back({seg_id, value}); ++ num_nonzero++; ++ } ++ } ++ } ++ Tensor* output_indices = nullptr; ++ OP_REQUIRES_OK(context, ++ context->allocate_output(1, TensorShape({num_nonzero, 1}), ++ &output_indices)); ++ auto output_indices_data = output_indices->flat<int32>(); ++ ++ Tensor* output_nonzero = nullptr; ++ OP_REQUIRES_OK(context, ++ context->allocate_output(2, TensorShape({num_nonzero}), ++ &output_nonzero)); ++ auto output_nonzero_data = output_nonzero->flat<float>(); ++ for (int64 i = 0; i < num_nonzero; ++i) { ++ output_indices_data(i) = static_cast<int32>(results[i].first); ++ output_nonzero_data(i) = results[i].second; ++ } ++ ++ } ++ ++ private: ++ bool is_mean_; ++}; ++ ++#define REGISTER_KERNEL(Tidx) \ ++ REGISTER_KERNEL_BUILDER(Name("KPFusedSparseSegmentReduceNonzero") \ ++ .Device(DEVICE_CPU) \ ++ .TypeConstraint<Tidx>("Tidx"), \ ++ KPFusedSparseSegmentReduceNonzeroOp<Tidx>); ++REGISTER_KERNEL(int64) ++REGISTER_KERNEL(int32) ++#undef REGISTER_KERNEL +\ No newline at end of file +diff --git a/annc/tensorflow/kernels/embedding_fused_sparse_segment_reduce_nonzero_test.cc b/annc/tensorflow/kernels/embedding_fused_sparse_segment_reduce_nonzero_test.cc +new file mode 100644 +index 0000000..ac4a5c3 +--- /dev/null ++++ b/annc/tensorflow/kernels/embedding_fused_sparse_segment_reduce_nonzero_test.cc +@@ -0,0 +1,183 @@ ++/* Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved. ++ * ++ * Licensed under the Apache License, Version 2.0 (the "License"); ++ * you may not use this file except in compliance with the License. ++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. 
++ * ==============================================================================*/ ++ ++#include <functional> ++#include <memory> ++#include <vector> ++ ++#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" ++#include "tensorflow/core/framework/allocator.h" ++#include "tensorflow/core/framework/fake_input.h" ++#include "tensorflow/core/framework/node_def_builder.h" ++#include "tensorflow/core/framework/op_kernel.h" ++#include "tensorflow/core/framework/tensor.h" ++#include "tensorflow/core/framework/types.h" ++#include "tensorflow/core/framework/types.pb.h" ++#include "tensorflow/core/graph/testlib.h" ++#include "tensorflow/core/kernels/ops_testutil.h" ++#include "tensorflow/core/kernels/ops_util.h" ++#include "tensorflow/core/lib/core/status_test_util.h" ++#include "tensorflow/core/lib/gtl/array_slice.h" ++#include "tensorflow/core/lib/random/simple_philox.h" ++#include "tensorflow/core/lib/strings/str_util.h" ++#include "tensorflow/core/platform/test.h" ++#include "tensorflow/core/platform/test_benchmark.h" ++ ++namespace tensorflow { ++namespace { ++ ++class KPFusedSparseSegmentReduceNonzeroOpTest : public OpsTestBase { ++ protected: ++ void MakeOp(int combiner_mode) { ++ TF_ASSERT_OK(NodeDefBuilder("kp_fused_sparse_segment_reduce_nonzero", ++ "KPFusedSparseSegmentReduceNonzero") ++ .Input(FakeInput(DT_FLOAT)) // data ++ .Input(FakeInput(DT_INT32)) // indices ++ .Input(FakeInput(DT_INT64)) // slice_input ++ .Input(FakeInput(DT_INT32)) // begin ++ .Attr("combiner", combiner_mode) ++ .Finalize(node_def())); ++ TF_ASSERT_OK(InitOp()); ++ } ++}; ++ ++TEST_F(KPFusedSparseSegmentReduceNonzeroOpTest, TestReduceMean) { ++ MakeOp(1); ++ ++ AddInputFromArray<float>(TensorShape({8}), ++ {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}); ++ AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1}); ++ AddInputFromArray<int64>(TensorShape({3, 4}), ++ {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4}); ++ AddInputFromArray<int32>(TensorShape({2}), {0, 2}); ++ TF_ASSERT_OK(RunOpKernel()); ++ ++ Tensor expected(allocator(), DT_INT32, TensorShape({1})); ++ test::FillValues<int32>(&expected, {4}); ++ test::ExpectTensorEqual<int32>(expected, *GetOutput(0)); // output_shape ++ ++ Tensor expected_1(allocator(), DT_INT32, TensorShape({2, 1})); ++ test::FillValues<int32>(&expected_1, {2, 3}); ++ test::ExpectTensorEqual<int32>(expected_1, *GetOutput(1)); // output_indices ++ ++ Tensor expected_2(allocator(), DT_FLOAT, TensorShape({2})); ++ test::FillValues<float>(&expected_2, {2, 2}); ++ test::ExpectTensorEqual<float>(expected_2, *GetOutput(2)); // output_nonzero ++} ++ ++TEST_F(KPFusedSparseSegmentReduceNonzeroOpTest, TestReduceSum) { ++ MakeOp(0); ++ ++ AddInputFromArray<float>(TensorShape({8}), ++ {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}); ++ AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1}); ++ AddInputFromArray<int64>(TensorShape({3, 4}), ++ {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4}); ++ AddInputFromArray<int32>(TensorShape({2}), {0, 2}); ++ TF_ASSERT_OK(RunOpKernel()); ++ ++ Tensor expected(allocator(), DT_INT32, TensorShape({1})); ++ test::FillValues<int32>(&expected, {4}); ++ test::ExpectTensorEqual<int32>(expected, *GetOutput(0)); // output_shape ++ ++ Tensor expected_1(allocator(), DT_INT32, TensorShape({2, 1})); ++ test::FillValues<int32>(&expected_1, {2, 3}); ++ test::ExpectTensorEqual<int32>(expected_1, *GetOutput(1)); // output_indices ++ ++ Tensor expected_2(allocator(), DT_FLOAT, TensorShape({2})); ++ test::FillValues<float>(&expected_2, {4, 2}); ++ 
test::ExpectTensorEqual<float>(expected_2, *GetOutput(2));  // output_nonzero
++}
++
++TEST_F(KPFusedSparseSegmentReduceNonzeroOpTest, TestInvalidData) {
++  MakeOp(0);
++
++  AddInputFromArray<float>(TensorShape({4, 2}),
++                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
++  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
++  AddInputFromArray<int64>(TensorShape({3, 4}),
++                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
++  AddInputFromArray<int32>(TensorShape({2}), {0, 2});
++
++  Status s = RunOpKernel();
++  EXPECT_FALSE(s.ok());
++  EXPECT_TRUE(absl::StrContains(s.message(), "Input data must be a vector"));
++}
++
++TEST_F(KPFusedSparseSegmentReduceNonzeroOpTest, TestInvalidSliceInput) {
++  MakeOp(0);
++
++  AddInputFromArray<float>(TensorShape({8}),
++                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
++  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
++  AddInputFromArray<int64>(TensorShape({3, 4, 1}),
++                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
++  AddInputFromArray<int32>(TensorShape({2}), {0, 2});
++
++  Status s = RunOpKernel();
++  EXPECT_FALSE(s.ok());
++  EXPECT_TRUE(absl::StrContains(s.message(), "slice input must be 2-D"));
++}
++
++TEST_F(KPFusedSparseSegmentReduceNonzeroOpTest, TestInvalidBegin) {
++  MakeOp(0);
++
++  AddInputFromArray<float>(TensorShape({8}),
++                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
++  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
++  AddInputFromArray<int64>(TensorShape({3, 4}),
++                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
++  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
++
++  Status s = RunOpKernel();
++  EXPECT_FALSE(s.ok());
++  EXPECT_TRUE(absl::StrContains(s.message(), "begin must have 2 elements"));
++}
++
++TEST_F(KPFusedSparseSegmentReduceNonzeroOpTest, TestColsOutOfBounds) {
++  MakeOp(0);
++
++  AddInputFromArray<float>(TensorShape({8}),
++                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
++  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
++  AddInputFromArray<int64>(TensorShape({3, 4}),
++                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
++  AddInputFromArray<int32>(TensorShape({2}), {0, 4});
++
++  Status s = RunOpKernel();
++  EXPECT_FALSE(s.ok());
++  EXPECT_TRUE(absl::StrContains(s.message(), "Column index out of range"));
++}
++
++TEST_F(KPFusedSparseSegmentReduceNonzeroOpTest, TestIndicesOutOfBounds) {
++  MakeOp(0);
++
++  AddInputFromArray<float>(TensorShape({8}),
++                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
++  AddInputFromArray<int32>(TensorShape({2}), {0, 2});
++  AddInputFromArray<int64>(TensorShape({3, 4}),
++                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
++  AddInputFromArray<int32>(TensorShape({2}), {0, 1});
++
++  Status s = RunOpKernel();
++  EXPECT_FALSE(s.ok());
++  EXPECT_TRUE(absl::StrContains(s.message(),
++                                "indices and slice_input.dim_size(0) should have same size"));
++}
++
++} // namespace
++} // namespace tensorflow
+diff --git a/annc/tensorflow/kernels/embedding_fused_sparse_segment_reduce_test.cc b/annc/tensorflow/kernels/embedding_fused_sparse_segment_reduce_test.cc
+new file mode 100644
+index 0000000..558ab85
+--- /dev/null
++++ b/annc/tensorflow/kernels/embedding_fused_sparse_segment_reduce_test.cc
+@@ -0,0 +1,205 @@
++/* Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved.
++ *
++ * Licensed under the Apache License, Version 2.0 (the "License");
++ * you may not use this file except in compliance with the License.
++ * You may obtain a copy of the License at ++ * ++ * http://www.apache.org/licenses/LICENSE-2.0 ++ * ++ * Unless required by applicable law or agreed to in writing, software ++ * distributed under the License is distributed on an "AS IS" BASIS, ++ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++ * See the License for the specific language governing permissions and ++ * limitations under the License. ++ * ==============================================================================*/ ++ ++#include <functional> ++#include <memory> ++#include <vector> ++ ++#include "tensorflow/core/common_runtime/kernel_benchmark_testlib.h" ++#include "tensorflow/core/framework/allocator.h" ++#include "tensorflow/core/framework/fake_input.h" ++#include "tensorflow/core/framework/node_def_builder.h" ++#include "tensorflow/core/framework/op_kernel.h" ++#include "tensorflow/core/framework/tensor.h" ++#include "tensorflow/core/framework/types.h" ++#include "tensorflow/core/framework/types.pb.h" ++#include "tensorflow/core/graph/testlib.h" ++#include "tensorflow/core/kernels/ops_testutil.h" ++#include "tensorflow/core/kernels/ops_util.h" ++#include "tensorflow/core/lib/core/status_test_util.h" ++#include "tensorflow/core/lib/gtl/array_slice.h" ++#include "tensorflow/core/lib/random/simple_philox.h" ++#include "tensorflow/core/lib/strings/str_util.h" ++#include "tensorflow/core/platform/test.h" ++#include "tensorflow/core/platform/test_benchmark.h" ++ ++namespace tensorflow { ++namespace { ++ ++class KPFusedSparseSegmentReduceOpTest : public OpsTestBase { ++ protected: ++ void MakeOp(int combiner_mode) { ++ TF_ASSERT_OK(NodeDefBuilder("kp_fused_sparse_segment_reduce", ++ "KPFusedSparseSegmentReduce") ++ .Input(FakeInput(DT_FLOAT)) // data ++ .Input(FakeInput(DT_INT32)) // indices ++ .Input(FakeInput(DT_INT64)) // slice_input ++ .Input(FakeInput(DT_INT32)) // begin ++ .Input(FakeInput(DT_INT32)) // begin_1 ++ .Attr("combiner", combiner_mode) ++ .Finalize(node_def())); ++ TF_ASSERT_OK(InitOp()); ++ } ++}; ++ ++TEST_F(KPFusedSparseSegmentReduceOpTest, TestReduceMean) { ++ MakeOp(1); ++ ++ AddInputFromArray<float>(TensorShape({4, 2}), ++ {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}); ++ AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1}); ++ AddInputFromArray<int64>(TensorShape({3, 4}), ++ {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4}); ++ AddInputFromArray<int32>(TensorShape({2}), {0, 2}); ++ AddInputFromArray<int32>(TensorShape({1}), {1}); ++ ++ TF_ASSERT_OK(RunOpKernel()); ++ ++ Tensor expected(allocator(), DT_FLOAT, TensorShape({4, 2})); ++ test::FillValues<float>(&expected, ++ {0.0f, 0.0f, 0.0f, 0.0f, 3.0f, 4.0f, 3.0f, 4.0f}); ++ test::ExpectTensorEqual<float>(expected, *GetOutput(0)); ++ ++ Tensor expected_1(allocator(), DT_INT32, TensorShape({})); ++ test::FillValues<int32>(&expected_1, {2}); ++ test::ExpectTensorEqual<int32>(expected_1, *GetOutput(1)); ++} ++ ++TEST_F(KPFusedSparseSegmentReduceOpTest, TestReduceSum) { ++ MakeOp(0); ++ ++ AddInputFromArray<float>(TensorShape({4, 2}), ++ {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f}); ++ AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1}); ++ AddInputFromArray<int64>(TensorShape({3, 4}), ++ {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4}); ++ AddInputFromArray<int32>(TensorShape({2}), {0, 2}); ++ AddInputFromArray<int32>(TensorShape({1}), {0}); ++ ++ TF_ASSERT_OK(RunOpKernel()); ++ ++ Tensor expected(allocator(), DT_FLOAT, TensorShape({4, 2})); ++ test::FillValues<float>(&expected, ++ {0.0f, 0.0f, 0.0f, 0.0f, 6.0f, 8.0f, 3.0f, 4.0f}); ++ 
test::ExpectTensorEqual<float>(expected, *GetOutput(0));
++
++  Tensor expected_1(allocator(), DT_INT32, TensorShape({}));
++  test::FillValues<int32>(&expected_1, {4});
++  test::ExpectTensorEqual<int32>(expected_1, *GetOutput(1));
++}
++
++TEST_F(KPFusedSparseSegmentReduceOpTest, TestColsOutOfBounds) {
++  MakeOp(0);
++
++  AddInputFromArray<float>(TensorShape({4, 2}),
++                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
++  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
++  AddInputFromArray<int64>(TensorShape({3, 4}),
++                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
++  AddInputFromArray<int32>(TensorShape({2}), {0, 5});
++  AddInputFromArray<int32>(TensorShape({1}), {0});
++
++  Status s = RunOpKernel();
++  EXPECT_FALSE(s.ok());
++  EXPECT_TRUE(absl::StrContains(s.message(), "Column index out of range"));
++}
++
++TEST_F(KPFusedSparseSegmentReduceOpTest, TestIndicesSizeMismatch) {
++  MakeOp(0);
++
++  AddInputFromArray<float>(TensorShape({4, 2}),
++                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
++  AddInputFromArray<int32>(TensorShape({2}),
++                           {0, 2});  // num_indices != slice_input.dim_size(0)
++  AddInputFromArray<int64>(TensorShape({3, 4}),
++                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
++  AddInputFromArray<int32>(TensorShape({2}), {0, 2});
++  AddInputFromArray<int32>(TensorShape({1}), {0});
++
++  Status s = RunOpKernel();
++  EXPECT_FALSE(s.ok());
++  EXPECT_TRUE(absl::StrContains(s.message(),
++                                "indices and slice_input.dim_size(0) should have same size"));
++}
++
++TEST_F(KPFusedSparseSegmentReduceOpTest, TestInvalidData) {
++  MakeOp(0);
++
++  AddInputFromArray<float>(
++      TensorShape({4, 2, 1}),
++      {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});  // data.dims() > 2
++  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
++  AddInputFromArray<int64>(TensorShape({3, 4}),
++                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
++  AddInputFromArray<int32>(TensorShape({2}), {0, 2});
++  AddInputFromArray<int32>(TensorShape({1}), {0});
++
++  Status s = RunOpKernel();
++  EXPECT_FALSE(s.ok());
++  EXPECT_TRUE(absl::StrContains(s.message(), "input must be 2-D"));
++}
++
++TEST_F(KPFusedSparseSegmentReduceOpTest, TestInvalidSliceInput) {
++  MakeOp(0);
++
++  AddInputFromArray<float>(TensorShape({4, 2}),
++                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
++  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
++  AddInputFromArray<int64>(
++      TensorShape({3, 4, 1}),
++      {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});  // slice_input.dims() > 2
++  AddInputFromArray<int32>(TensorShape({2}), {0, 2});
++  AddInputFromArray<int32>(TensorShape({1}), {0});
++
++  Status s = RunOpKernel();
++  EXPECT_FALSE(s.ok());
++  EXPECT_TRUE(absl::StrContains(s.message(), "slice input must be 2-D"));
++}
++
++TEST_F(KPFusedSparseSegmentReduceOpTest, TestInvalidBegin) {
++  MakeOp(0);
++
++  AddInputFromArray<float>(TensorShape({4, 2}),
++                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
++  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
++  AddInputFromArray<int64>(TensorShape({3, 4}),
++                           {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4});
++  AddInputFromArray<int32>(TensorShape({3}),
++                           {0, 2, 1});  // begin has 3 elements
++  AddInputFromArray<int32>(TensorShape({1}), {0});
++
++  Status s = RunOpKernel();
++  EXPECT_FALSE(s.ok());
++  EXPECT_TRUE(absl::StrContains(s.message(), "begin must have 2 elements"));
++}
++
++TEST_F(KPFusedSparseSegmentReduceOpTest, TestInvalidBegin1) {
++  MakeOp(0);
++
++  AddInputFromArray<float>(TensorShape({4, 2}),
++                           {1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 6.0f, 7.0f, 8.0f});
++  AddInputFromArray<int32>(TensorShape({3}), {0, 2, 1});
++  
AddInputFromArray<int64>(TensorShape({3, 4}), ++ {1, 2, 2, 2, 1, 1, 2, 3, 2, 2, 3, 4}); ++ AddInputFromArray<int32>(TensorShape({2}), {0, 2}); ++ AddInputFromArray<int32>(TensorShape({2}), {0, 1}); // begin_1 has 2 elements ++ ++ Status s = RunOpKernel(); ++ EXPECT_FALSE(s.ok()); ++ EXPECT_TRUE(absl::StrContains(s.message(), "begin_1 must have 1 element")); ++} ++ ++} // namespace ++} // namespace tensorflow +diff --git a/annc/tensorflow/kernels/embedding_fused_sparse_select.cc b/annc/tensorflow/kernels/embedding_fused_sparse_select.cc +new file mode 100644 +index 0000000..306a420 +--- /dev/null ++++ b/annc/tensorflow/kernels/embedding_fused_sparse_select.cc +@@ -0,0 +1,111 @@ ++/* Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved. ++ ++Licensed under the Apache License, Version 2.0 (the "License"); ++you may not use this file except in compliance with the License. ++You may obtain a copy of the License at ++ ++ http://www.apache.org/licenses/LICENSE-2.0 ++ ++Unless required by applicable law or agreed to in writing, software ++distributed under the License is distributed on an "AS IS" BASIS, ++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++See the License for the specific language governing permissions and ++limitations under the License. ++==============================================================================*/ ++ ++#include <vector> ++#include <algorithm> ++ ++#include "tensorflow/core/framework/op_kernel.h" ++#include "tensorflow/core/framework/tensor.h" ++#include "tensorflow/core/util/work_sharder.h" ++#include "third_party/eigen3/unsupported/Eigen/CXX11/Tensor" ++#include "tensorflow/core/platform/logging.h" ++ ++using namespace tensorflow; ++ ++class KPFusedSparseSelect : public OpKernel { ++public: ++ explicit KPFusedSparseSelect(OpKernelConstruction* context) : OpKernel(context) { ++ } ++ ++ void Compute(OpKernelContext* context) override { ++ const Tensor& input_a = context->input(0); ++ const Tensor& input_b = context->input(1); ++ const Tensor& input_c = context->input(2); ++ const Tensor& greater = context->input(3); ++ const Tensor& equal1 = context->input(4); ++ const Tensor& equal2 = context->input(5); ++ const Tensor& equal3 = context->input(6); ++ ++ int32_t equal1_val = equal1.flat<int32_t>()(0); ++ int32_t equal2_val = equal2.flat<int32_t>()(0); ++ int32_t equal3_val = equal3.flat<int32_t>()(0); ++ VLOG(1) << "equal1_val: " << equal1_val; ++ VLOG(1) << "equal2_val: " << equal2_val; ++ VLOG(1) << "equal3_val: " << equal3_val; ++ ++ int32_t greater_val = greater.flat<int32_t>()(0); ++ auto a_flat = input_a.flat<int32_t>(); ++ auto b_flat = input_b.flat<int32_t>(); ++ auto c_flat = input_c.flat<int32_t>(); ++ VLOG(1) << "input_a shape: " << input_a.shape().DebugString(); ++ VLOG(1) << "input_b shape: " << input_b.shape().DebugString(); ++ VLOG(1) << "input_c shape: " << input_c.shape().DebugString(); ++ OP_REQUIRES(context, input_a.NumElements() == input_b.NumElements(), ++ errors::InvalidArgument("Input num elements of a and b must match")); ++ OP_REQUIRES(context, input_a.NumElements() == input_c.NumElements(), ++ errors::InvalidArgument("Input num elements of a and c must match")); ++ auto N = input_a.NumElements(); ++ ++ Eigen::TensorMap<Eigen::Tensor<const int32_t, 2, Eigen::RowMajor>> a_reshaped_tensor(a_flat.data(), N, 1); ++ Eigen::TensorMap<Eigen::Tensor<const int32_t, 2, Eigen::RowMajor>> b_reshaped_tensor(b_flat.data(), N, 1); ++ Eigen::TensorMap<Eigen::Tensor<const int32_t, 2, Eigen::RowMajor>> 
c_reshaped_tensor(c_flat.data(), N, 1);
++
++    Tensor* output_x = nullptr;
++    Tensor* output_y = nullptr;
++    Tensor* output_w = nullptr;
++
++    OP_REQUIRES_OK(context, context->allocate_output(0, TensorShape({N, 1}), &output_x));
++    OP_REQUIRES_OK(context, context->allocate_output(1, TensorShape({N, 1}), &output_y));
++    OP_REQUIRES_OK(context, context->allocate_output(2, TensorShape({N, 2}), &output_w));
++
++    Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor>> out_x(
++        output_x->flat<float>().data(),
++        output_x->dim_size(0),
++        output_x->dim_size(1)
++    );
++
++    Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor>> out_y(
++        output_y->flat<float>().data(),
++        output_y->dim_size(0),
++        output_y->dim_size(1)
++    );
++
++    Eigen::TensorMap<Eigen::Tensor<float, 2, Eigen::RowMajor>> out_w(
++        output_w->flat<float>().data(),
++        output_w->dim_size(0),
++        output_w->dim_size(1)
++    );
++
++    auto worker_threads = context->device()->tensorflow_cpu_worker_threads();
++    const int64 cost_per_unit = std::max(N / worker_threads->num_threads, int64(10));
++
++    auto work = [&](int64 start, int64 end) {
++      for (int64 i = start; i < end; i++) {
++        // Greater(bool)+Cast.2406(float) --> 1.0f / 0.0f
++        float a_greater = (a_reshaped_tensor(i, 0) > greater_val) ? 1.0f : 0.0f;
++        float res_equal1 = (b_reshaped_tensor(i, 0) == equal1_val) ? 1.0f : a_greater;  // Fill.2409-->1.0f
++        float res_equal2 = (b_reshaped_tensor(i, 0) == equal2_val) ? 1.0f : res_equal1;  // Fill.2409-->1.0f
++        out_x(i, 0) = a_reshaped_tensor(i, 0);  // Reshape.2401
++        out_y(i, 0) = res_equal2;
++        out_w(i, 0) = res_equal2;  // Mul.2419 hard-codes 1.0f * input
++        out_w(i, 1) = 1.0f;  // select_2427 is eliminated; use Fill.2422 --> 1.0f directly
++      }
++    };
++    Shard(worker_threads->num_threads, worker_threads->workers, N, cost_per_unit, work);
++  }
++};
++
++REGISTER_KERNEL_BUILDER(Name("KPFusedSparseSelect").Device(DEVICE_CPU),
++                        KPFusedSparseSelect);
+\ No newline at end of file
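Stripped of the Eigen maps and the Shard call, the per-element decision this kernel encodes is small enough to state as a pure function; the SelectValue name below is illustrative and not part of the patch.

#include <cstdint>

// Per-element logic of KPFusedSparseSelect: a match on equal1 or equal2
// forces 1.0f; otherwise the result falls back to the greater-than test.
// The value feeds output_y and column 0 of output_w; column 1 is always 1.0f.
float SelectValue(int32_t a, int32_t b, int32_t greater_val,
                  int32_t equal1_val, int32_t equal2_val) {
  float v = (a > greater_val) ? 1.0f : 0.0f;
  if (b == equal1_val || b == equal2_val) {
    v = 1.0f;
  }
  return v;
}

For the first positive test in the file that follows (a = 5, greater = 4, b = 1, equal1 = 1), this yields 1.0f, matching the expected output_y.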
+diff --git a/annc/tensorflow/kernels/embedding_fused_sparse_select_test.cc b/annc/tensorflow/kernels/embedding_fused_sparse_select_test.cc
+new file mode 100644
+index 0000000..f956415
+--- /dev/null
++++ b/annc/tensorflow/kernels/embedding_fused_sparse_select_test.cc
+@@ -0,0 +1,182 @@
++/* Copyright 2025 The Huawei Technologies Co. Authors. All Rights Reserved.
++
++Licensed under the Apache License, Version 2.0 (the "License");
++you may not use this file except in compliance with the License.
++You may obtain a copy of the License at
++
++    http://www.apache.org/licenses/LICENSE-2.0
++
++Unless required by applicable law or agreed to in writing, software
++distributed under the License is distributed on an "AS IS" BASIS,
++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++See the License for the specific language governing permissions and
++limitations under the License.
++==============================================================================*/
++
++#include "tensorflow/core/framework/fake_input.h"
++#include "tensorflow/core/framework/node_def_builder.h"
++#include "tensorflow/core/framework/tensor.h"
++#include "tensorflow/core/framework/tensor_testutil.h"
++#include "tensorflow/core/kernels/ops_testutil.h"
++#include "tensorflow/core/lib/core/status_test_util.h"
++
++namespace {
++using tensorflow::AllocatorAttributes;
++using tensorflow::DT_FLOAT;
++using tensorflow::DT_INT32;
++using tensorflow::DT_INT64;
++using tensorflow::int64;
++using tensorflow::int32;
++using tensorflow::NodeDefBuilder;
++using tensorflow::OpsTestBase;
++using tensorflow::Status;
++using tensorflow::Tensor;
++using tensorflow::TensorShape;
++using tensorflow::test::ExpectClose;
++using tensorflow::test::FillValues;
++using tensorflow::test::AsTensor;
++using tensorflow::test::ExpectTensorEqual;
++
++class KPFusedSparseSelectTest : public OpsTestBase {
++ protected:
++  void RunValidCase(
++      const TensorShape& shape,
++      const std::vector<int32>& a_data,
++      const std::vector<int32>& b_data,
++      const std::vector<int32>& c_data,
++      int32_t greater_val,
++      int32_t equal1_val,
++      int32_t equal2_val,
++      const std::vector<float>& expected_y,
++      const std::vector<float>& expected_w_col0) {
++
++    TF_EXPECT_OK(NodeDefBuilder("kp_fused_sparse_select", "KPFusedSparseSelect")
++                     .Input(FakeInput(DT_INT32))
++                     .Input(FakeInput(DT_INT32))
++                     .Input(FakeInput(DT_INT32))
++                     .Input(FakeInput(DT_INT32))  // greater
++                     .Input(FakeInput(DT_INT32))  // equal1
++                     .Input(FakeInput(DT_INT32))  // equal2
++                     .Input(FakeInput(DT_INT32))  // equal3
++                     .Finalize(node_def()));
++    TF_EXPECT_OK(InitOp());
++
++    AddInputFromArray<int32>(shape, a_data);
++    AddInputFromArray<int32>(shape, b_data);
++    AddInputFromArray<int32>(shape, c_data);
++    AddInputFromArray<int32>(TensorShape({}), {greater_val});  // scalar
++    AddInputFromArray<int32>(TensorShape({}), {equal1_val});
++    AddInputFromArray<int32>(TensorShape({}), {equal2_val});
++    AddInputFromArray<int32>(TensorShape({}), {0});  // equal3_val (unused)
++
++    TF_ASSERT_OK(RunOpKernel());
++
++    const Tensor& out_x = *GetOutput(0);
++    const Tensor& out_y = *GetOutput(1);
++    const Tensor& out_w = *GetOutput(2);
++
++    int32 num_elements = expected_y.size();
++    // Verify output_x: it is simply input_a
++    std::vector<float> a_data_float(a_data.begin(), a_data.end());
++    ExpectTensorEqual<float>(out_x, AsTensor<float>(a_data_float, {num_elements, 1}));
++
++    // Verify output_y
++    ExpectTensorEqual<float>(out_y, AsTensor<float>(expected_y, {num_elements, 1}));
++    // Verify the first column of output_w
++    auto w_mat = out_w.matrix<float>();
++    for (int i = 0; i < w_mat.dimension(0); ++i) {
++      EXPECT_FLOAT_EQ(w_mat(i, 0), expected_w_col0[i]);
++      EXPECT_FLOAT_EQ(w_mat(i, 1), 1.0f);  // the second column must be 1.0
++    }
++  }
++
++  Status RunOpExpectFailure(
++      const TensorShape& shape,
++      const std::vector<int32>& a_data,
++      const std::vector<int32>& b_data,
++      const std::vector<int32>& c_data,
++      int32_t greater_val,
++      int32_t equal1_val,
++      int32_t equal2_val) {
++
++    TF_CHECK_OK(NodeDefBuilder("kp_fused_sparse_select", "KPFusedSparseSelect")
++                    .Input(FakeInput(DT_INT32))
++                    .Input(FakeInput(DT_INT32))
++                    .Input(FakeInput(DT_INT32))
++                    .Input(FakeInput(DT_INT32))
++                    .Input(FakeInput(DT_INT32))
++                    .Input(FakeInput(DT_INT32))
++                    .Input(FakeInput(DT_INT32))
++                    .Finalize(node_def()));
++    TF_CHECK_OK(InitOp());
++    TensorShape b_shape({static_cast<int64>(b_data.size())});
++    TensorShape c_shape({static_cast<int64>(c_data.size())});
++    AddInputFromArray<int32>(shape, a_data);
++    AddInputFromArray<int32>(b_shape, b_data);
++    AddInputFromArray<int32>(c_shape, c_data);
++    AddInputFromArray<int32>(TensorShape({}), {greater_val});
++    AddInputFromArray<int32>(TensorShape({}), {equal1_val});
++    AddInputFromArray<int32>(TensorShape({}), {equal2_val});
++    AddInputFromArray<int32>(TensorShape({}), {0});
++
++    return RunOpKernel();
++  }
++};
++
++// ==================== Positive tests ====================
++// See fused_embedding_sparse_select_test.py for more positive-case coverage
++TEST_F(KPFusedSparseSelectTest, Valid_NormalInput) {
++  RunValidCase(
++      TensorShape({3}),  // shape
++      {5, 3, 8},  // input_a
++      {1, 2, 1},  // input_b
++      {9, 8, 7},  // input_c (unused)
++      4,  // greater_val
++      1,  // equal1_val
++      3,  // equal2_val
++      {1.0f, 0.0f, 1.0f},  // expected_y
++      {1.0f, 0.0f, 1.0f}  // expected_w_col0
++  );
++}
++
++TEST_F(KPFusedSparseSelectTest, Valid_2DInput) {
++  RunValidCase(
++      TensorShape({2, 2}),
++      {6, 3, 8, 2},
++      {2, 1, 3, 4},
++      {0, 0, 0, 0},
++      5,
++      2,
++      3,
++      {1.0f, 0.0f, 1.0f, 0.0f},
++      {1.0f, 0.0f, 1.0f, 0.0f}
++  );
++}
++// ==================== Negative tests ====================
++// Negative case 1: element counts of input_a and input_b do not match
++TEST_F(KPFusedSparseSelectTest, Invalid_DimMismatch_AB) {
++  Status s = RunOpExpectFailure(
++      TensorShape({3}),  // a has 3 elements
++      {1, 2, 3},
++      {4, 5},  // b has 2 elements → mismatch!
++      {6, 7, 8},
++      0, 1, 2
++  );
++  EXPECT_FALSE(s.ok());
++  EXPECT_TRUE(absl::StrContains(s.message(), "Input num elements of a and b must match"));
++}
++
++// Negative case 2: element counts of input_a and input_c do not match
++TEST_F(KPFusedSparseSelectTest, Invalid_DimMismatch_AC) {
++  Status s = RunOpExpectFailure(
++      TensorShape({2}),
++      {1, 2},
++      {3, 4},
++      {5},  // c has only 1 element → mismatch!
++      0, 1, 2
++  );
++  EXPECT_FALSE(s.ok());
++  EXPECT_TRUE(absl::StrContains(s.message(), "Input num elements of a and c must match"));
++}
++
++}
+\ No newline at end of file
+diff --git a/annc/tensorflow/ops/embedding_fused_ops.cc b/annc/tensorflow/ops/embedding_fused_ops.cc
+new file mode 100644
+index 0000000..bc0ce24
+--- /dev/null
++++ b/annc/tensorflow/ops/embedding_fused_ops.cc
+@@ -0,0 +1,133 @@
++/* Copyright 2018 The TensorFlow Authors. All Rights Reserved.
++
++Licensed under the Apache License, Version 2.0 (the "License");
++you may not use this file except in compliance with the License.
++You may obtain a copy of the License at
++
++    http://www.apache.org/licenses/LICENSE-2.0
++
++Unless required by applicable law or agreed to in writing, software
++distributed under the License is distributed on an "AS IS" BASIS,
++WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
++See the License for the specific language governing permissions and
++limitations under the License.
++==============================================================================*/ ++#include <stdio.h> ++ ++#include "tensorflow/core/framework/op.h" ++#include "tensorflow/core/framework/types.h" ++#include "tensorflow/core/framework/shape_inference.h" ++#include "tensorflow/core/framework/common_shape_fns.h" ++ ++namespace tensorflow { ++ ++using shape_inference::DimensionHandle; ++using shape_inference::InferenceContext; ++using shape_inference::ShapeHandle; ++using shape_inference::UnchangedShape; ++ ++REGISTER_OP("KPFusedSparseSegmentReduce") ++ .Input("data: float") ++ .Input("indices: Tidx") ++ .Input("slice_input: int64") ++ .Input("begin: int32") ++ .Input("begin_1: int32") ++ .Attr("combiner: int = 1") // 0 for SUM, 1 for MEAN ++ .Attr("Tidx: {int32, int64} = DT_INT32") ++ .Output("output: float") ++ .Output("slice_output: int32") ++ .SetShapeFn(shape_inference::UnknownShape); ++ ++REGISTER_OP("KPFusedSparseSegmentReduceNonzero") ++ .Input("data: float") ++ .Input("indices: Tidx") ++ .Input("slice_input: int64") ++ .Input("begin: int32") ++ .Attr("combiner: int = 1") // 0 for SUM, 1 for MEAN ++ .Attr("Tidx: {int32, int64} = DT_INT32") ++ .Output("output_shape: int32") ++ .Output("output_indices: int32") ++ .Output("output_nonzero: float") ++ .SetShapeFn(shape_inference::UnknownShape); ++ ++REGISTER_OP("KPFusedEmbeddingPaddingFast") ++ .Input("input0: int64") ++ .Input("input1: float") ++ .Input("input2: int32") ++ .Input("input3: int32") ++ .Input("pack: int32") ++ .Output("output0: int32") ++ .Output("output1: int32") ++ .SetShapeFn([](InferenceContext* c) { ++ ShapeHandle scalar_shape = c->Scalar(); ++ c->set_output(0, scalar_shape); ++ c->set_output(1, scalar_shape); ++ return OkStatus(); ++ }); ++ ++REGISTER_OP("KPFusedEmbeddingPadding") ++ .Input("input0: int64") ++ .Input("input1: float") ++ .Input("input2: int32") ++ .Input("input3: int32") ++ .Input("pack: int32") ++ .Output("output0: int32") ++ .Output("output1: float") ++ .SetShapeFn([](InferenceContext* c) { ++ ShapeHandle out; ++ ShapeHandle scalar_shape = c->Scalar(); ++ TF_RETURN_IF_ERROR(c->MakeShapeFromShapeTensor(3, &out)); ++ c->set_output(0, scalar_shape); ++ c->set_output(1, out); ++ return OkStatus(); ++ }); ++ ++REGISTER_OP("KPFusedSparseSelect") ++ .Input("input_a: int32") ++ .Input("input_b: int32") ++ .Input("input_c: int32") ++ .Input("greater: int32") ++ .Input("equal1: int32") ++ .Input("equal2: int32") ++ .Input("equal3: int32") ++ .Output("output_x: float") ++ .Output("output_y: float") ++ .Output("output_w: float") ++ .SetShapeFn(shape_inference::UnknownShape); ++ ++REGISTER_OP("KPFusedSparseReshape") ++ .Input("slice_input: int64") ++ .Input("begin: int32") ++ .Input("new_shape: int64") ++ .Input("pack_const: int64") ++ .Output("out_indices: int64") ++ .Output("out_shape: int64") ++ .SetShapeFn(shape_inference::UnknownShape); ++ ++REGISTER_OP("KPFusedSparseDynamicStitch") ++ .Input("x: int64") ++ .Input("variables: N * float") ++ .Output("output: float") ++ .Attr("N: int >= 1") ++ .SetShapeFn(shape_inference::UnknownShape); ++ ++REGISTER_OP("KPFusedGather") ++ .Input("data: float") ++ .Input("slice_input: int64") ++ .Input("begin: int32") ++ .Output("out_shape: int64") ++ .Output("out_indices: int32") ++ .Output("out_data: float") ++ .SetShapeFn(shape_inference::UnknownShape); ++ ++REGISTER_OP("KPFusedEmbeddingActionIdGather") ++ .Input("input0: Tindices1") ++ .Input("input1: float") ++ .Input("input2: Tindices2") ++ .Input("input3: int32") ++ .Input("pack: int32") ++ .Attr("Tindices1: {int32, 
int64} = DT_INT64") ++ .Attr("Tindices2: {int32, int64} = DT_INT32") ++ .Output("output0: float") ++ .SetShapeFn(shape_inference::UnknownShape); ++} // namespace tensorflow +\ No newline at end of file +diff --git a/annc/tensorflow/tf_annc_optimizer.patch b/annc/tensorflow/tf_annc_optimizer.patch +new file mode 100644 +index 0000000..c92330a +--- /dev/null ++++ b/annc/tensorflow/tf_annc_optimizer.patch +@@ -0,0 +1,221 @@ ++diff --git a/tensorflow/core/BUILD b/tensorflow/core/BUILD ++index 538574360ba..b43e0455802 100644 ++--- a/tensorflow/core/BUILD +++++ b/tensorflow/core/BUILD ++@@ -629,6 +629,7 @@ cc_library( ++ "//tensorflow/core/kernels/linalg", ++ "//tensorflow/core/kernels/sparse:kernels", ++ "//tensorflow/core/kernels/uniform_quant_ops:kernels", +++ "//tensorflow/core/kernels:embedding_fused_ops", ++ ] + if_mkl([ ++ "//tensorflow/core/kernels/mkl:mkl_concat_op", ++ "//tensorflow/core/kernels/mkl:mkl_dequantize_op", ++diff --git a/tensorflow/core/grappler/optimizers/BUILD b/tensorflow/core/grappler/optimizers/BUILD ++index ecd559734ea..97a918ead6d 100644 ++--- a/tensorflow/core/grappler/optimizers/BUILD +++++ b/tensorflow/core/grappler/optimizers/BUILD ++@@ -880,6 +880,18 @@ tf_cuda_cc_test( ++ ], ++ ) ++ +++alias( +++ name = "is_aarch64", +++ actual = "@local_xla//xla/service/cpu:is_aarch64", +++ visibility = ["//visibility:public"], +++) +++ +++alias( +++ name = "aarch64_and_annc_disabled", +++ actual = "@local_xla//xla/service/cpu:aarch64_and_annc_disabled", +++ visibility = ["//visibility:public"], +++) +++ ++ tf_kernel_library( ++ name = "remapper", ++ srcs = ["remapper.cc"], ++@@ -904,7 +916,12 @@ tf_kernel_library( ++ "//tensorflow/core/grappler/utils:symbolic_shapes", ++ "//tensorflow/core/grappler/utils:topological_sort", ++ "@com_google_absl//absl/container:flat_hash_set", ++- ] + if_mkl(["//tensorflow/core/graph:mkl_graph_util"]), +++ ] + if_mkl(["//tensorflow/core/graph:mkl_graph_util"]) +++ + select({ +++ ":aarch64_and_annc_disabled": [], +++ ":is_aarch64": ["//tensorflow/core/grappler/optimizers/graph_optimizer:annc_graph_opt"], +++ "//conditions:default": [], +++ }) ++ ) ++ ++ tf_cuda_cc_test( ++diff --git a/tensorflow/core/grappler/optimizers/remapper.cc b/tensorflow/core/grappler/optimizers/remapper.cc ++index 3c37150f496..2fbde836d1d 100644 ++--- a/tensorflow/core/grappler/optimizers/remapper.cc +++++ b/tensorflow/core/grappler/optimizers/remapper.cc ++@@ -52,6 +52,7 @@ limitations under the License. 
++ #include "third_party/gpus/cudnn/cudnn.h" ++ #endif // GOOGLE_CUDA ++ +++#include "tensorflow/core/grappler/optimizers/graph_optimizer/graph_opt.h" ++ namespace tensorflow { ++ namespace grappler { ++ ++@@ -4596,6 +4597,7 @@ bool RequiresInferredShapes(const RemapperContext& ctx, int node_index, ++ Status Remapper::Optimize(Cluster* cluster, const GrapplerItem& item, ++ GraphDef* optimized_graph) { ++ GrapplerItem mutable_item = item; +++ annc::run_graph_optimization(&mutable_item.graph); ++ Status status; ++ RemapperContext ctx(&mutable_item, &status, cpu_layout_conversion_, ++ xla_auto_clustering_on_); ++diff --git a/tensorflow/core/kernels/BUILD b/tensorflow/core/kernels/BUILD ++index 22617ef8ab5..f89a084f94c 100644 ++--- a/tensorflow/core/kernels/BUILD +++++ b/tensorflow/core/kernels/BUILD ++@@ -3655,6 +3655,97 @@ tf_kernel_library( ++ ]) + [":fft_impl"], ++ ) ++ +++tf_kernel_library( +++ name = "embedding_fused_action_id_gather_op", +++ srcs = ["embedding_fused_action_id_gather.cc"], +++ deps = MATH_DEPS, +++) +++ +++tf_kernel_library( +++ name = "embedding_fused_gather_op", +++ srcs = ["embedding_fused_gather.cc"], +++ deps = MATH_DEPS, +++) +++ +++tf_kernel_library( +++ name = "embedding_fused_padding_op", +++ srcs = ["embedding_fused_padding.cc"], +++ deps = MATH_DEPS, +++) +++ +++tf_kernel_library( +++ name = "embedding_fused_sparse_dynamic_stitch_op", +++ srcs = ["embedding_fused_sparse_dynamic_stitch.cc"], +++ deps = MATH_DEPS, +++) +++ +++tf_kernel_library( +++ name = "embedding_fused_reshape_op", +++ srcs = ["embedding_fused_sparse_reshape.cc"], +++ deps = MATH_DEPS + [ +++ ":reshape_util", +++ ], +++) +++ +++tf_kernel_library( +++ name = "embedding_fused_sparse_segment_reduce_op", +++ srcs = ["embedding_fused_sparse_segment_reduce.cc"], +++ deps = MATH_DEPS, +++) +++ +++tf_kernel_library( +++ name = "embedding_fused_sparse_segment_reduce_nonzero_op", +++ srcs = ["embedding_fused_sparse_segment_reduce_nonzero.cc"], +++ deps = MATH_DEPS + ["@com_google_absl//absl/container:flat_hash_map"], +++) +++ +++tf_kernel_library( +++ name = "embedding_fused_sparse_select_op", +++ srcs = ["embedding_fused_sparse_select.cc"], +++ deps = MATH_DEPS, +++) +++ +++cc_library( +++ name = "embedding_fused_ops", +++ deps = [ +++ ":embedding_fused_action_id_gather_op", +++ ":embedding_fused_gather_op", +++ ":embedding_fused_padding_op", +++ ":embedding_fused_sparse_dynamic_stitch_op", +++ ":embedding_fused_reshape_op", +++ ":embedding_fused_sparse_segment_reduce_op", +++ ":embedding_fused_sparse_segment_reduce_nonzero_op", +++ ":embedding_fused_sparse_select_op", +++ ], +++) +++ +++tf_cc_test( +++ name = "embedding_fused_ops_test", +++ size = "small", +++ srcs = [ +++ "embedding_fused_action_id_gather_test.cc", +++ "embedding_fused_sparse_dynamic_stitch_test.cc", +++ "embedding_fused_sparse_segment_reduce_test.cc", +++ "embedding_fused_sparse_segment_reduce_nonzero_test.cc", +++ "embedding_fused_padding_test.cc", +++ "embedding_fused_sparse_select_test.cc", +++ "embedding_fused_gather_test.cc", +++ "embedding_fused_sparse_reshape_test.cc", +++ ], +++ deps = [ +++ ":ops_testutil", +++ ":ops_util", +++ ":embedding_fused_ops", +++ "//tensorflow/core:core_cpu", +++ "//tensorflow/core:framework", +++ "//tensorflow/core:lib", +++ "//tensorflow/core:protos_all_cc", +++ "//tensorflow/core:test", +++ "//tensorflow/core:test_main", +++ "//tensorflow/core:testlib", +++ ], +++) +++ ++ tf_kernel_library( ++ name = "reduction_ops", ++ gpu_srcs = ["reduction_gpu_kernels.cu.h"], ++diff --git 
a/tensorflow/core/ops/BUILD b/tensorflow/core/ops/BUILD ++index 91d80b6c2b5..b00b2a5d027 100644 ++--- a/tensorflow/core/ops/BUILD +++++ b/tensorflow/core/ops/BUILD ++@@ -55,6 +55,7 @@ tf_gen_op_libs( ++ "decode_proto_ops", ++ "encode_proto_ops", ++ "experimental_dataset_ops", +++ "embedding_fused_ops", ++ "filesystem_ops", ++ "function_ops", ++ "functional_ops", ++@@ -323,6 +324,7 @@ cc_library( ++ ":training_ops_op_lib", ++ ":uniform_quant_ops_op_lib", ++ ":word2vec_ops", +++ ":embedding_fused_ops_op_lib", ++ ] + select({ ++ # Non-tpu platforms don't need tpu dependency. ++ "//tensorflow:chromiumos": [], ++diff --git a/third_party/xla/xla/service/cpu/BUILD b/third_party/xla/xla/service/cpu/BUILD ++index 6e0ea613435..47c346b4e93 100644 ++--- a/third_party/xla/xla/service/cpu/BUILD +++++ b/third_party/xla/xla/service/cpu/BUILD ++@@ -198,6 +198,23 @@ cc_library( ++ ], ++ ) ++ +++config_setting( +++ name = "disable_annc", +++ define_values = {"disable_annc": "false"}, +++ visibility = ["//visibility:public"], +++) +++ +++config_setting( +++ name = "is_aarch64", +++ constraint_values = ["@platforms//cpu:aarch64"], +++) +++ +++config_setting( +++ name = "aarch64_and_annc_disabled", +++ constraint_values = ["@platforms//cpu:aarch64"], +++ define_values = {"disable_annc": "true"}, +++) +++ ++ cc_library( ++ name = "cpu_compiler_pure", ++ srcs = ["cpu_compiler.cc"], ++ ++ +-- +2.33.0 + @@ -7,24 +7,30 @@ %global max_jobs 16 -%define debug_package %{nil} +%global debug_package %{nil} Summary: %{name} is an AI compiler designed to optimize and compile ML model into high-performance executable code that can be executed on various targets. Name: ANNC Version: 0.0.2 -Release: 1 +Release: 3 # Package onnxruntime and SafeInt have MIT License. # Package onnx has Apache License 2.0. License: MIT and ASL 2.0 and Boost and BSD -URL: https://gitee.com/openeuler/AI4C +URL: https://gitee.com/openeuler/ANNC Source0: %{pkg_version}.tar.gz -Source1: annc_external.tar.gz.aa -Source2: annc_external.tar.gz.ab +Source1: external.tar.gz.aa +Source2: external.tar.gz.ab +Source3: external.tar.gz.ac +Source4: XNNPACK.tar.gz +Source5: v3.2.tar.gz %ifarch x86_64 Patch0: x86_64_external_files.patch %endif +Patch1: 0001-fix-pattern-conflicts.patch +Patch2: 0002-Add-graph-optimizer-and-embedding_fused-kernels.patch BuildRequires: cmake >= 3.9.9 +BuildRequires: make BuildRequires: gcc BuildRequires: gcc-c++ BuildRequires: bzip2 @@ -43,21 +49,29 @@ BuildRequires: bazel %{name} is is an AI compiler designed to optimize and compile ML model into high-performance executable code that can be executed on various targets. %prep -cat %{SOURCE1} %{SOURCE2} > annc_external.tar.gz -tar -xzf annc_external.tar.gz -C . +cat %{SOURCE1} %{SOURCE2} %{SOURCE3} > external.tar.gz +tar xf external.tar.gz -C . %ifarch x86_64 -%patch 0 -p0 -d . +%patch0 -p1 -d . %endif - -%autosetup -S git -n %{pkg_version} +tar xf %{SOURCE4} -C . +mkdir proxy +cp %{SOURCE5} ./proxy +%setup -q -n %{pkg_version} +%patch1 -p1 +%patch2 -p1 %build export ANNC=%{build_dir} -cd %{build_dir}/annc/service/cpu/xla/libs -bash xnnpack.sh - -export XNNPACK_BASE="$ANNC/annc/service/cpu/xla/libs" -export XNNPACK_DIR="$XNNPACK_BASE/XNNPACK" +cd %{_builddir}/XNNPACK/build +CFLAGS="-fPIC" CXXFLAGS="-fPIC" cmake .. 
-DXNNPACK_BUILD_BENCHMARKS=OFF \
+    -DXNNPACK_BUILD_TESTS=OFF \
+    -DXNNPACK_LIBRARY_TYPE=shared \
+    -DCMAKE_BUILD_TYPE=Release
+make -j %{max_jobs}
+rm -rf $ANNC/annc/service/cpu/xla/libs/libXNNPACK.so
+cp %{_builddir}/XNNPACK/build/libXNNPACK.so $ANNC/annc/service/cpu/xla/libs
+export XNNPACK_DIR="%{_builddir}/XNNPACK"
 
 CPLUS_INCLUDE_PATH+="$ANNC/annc/service/cpu/xla:"
 CPLUS_INCLUDE_PATH+="$ANNC/annc/service/:"
@@ -75,9 +89,12 @@ run_bazel_build() {
     --verbose_failures \
     --action_env="baila=548" \
     --define tflite_with_xnnpack=false \
-    --copt="-g" \
-    --copt="-DNDBUG" \
-    annc/service/cpu:libannc.so
+    --local_ram_resources=512 \
+    --distdir=%{_builddir}/proxy \
+    annc/service/cpu:libannc.so
+    # --copt="-g" \
+    # --copt="-DNDBUG" \
+    # annc/service/cpu:libannc.so
 }
 
 fix_action() {
@@ -85,7 +102,7 @@ fix_action() {
   external_path=$(find . -name "external" | head -n 1)
   if [ -n "$external_path" ]; then
     rm -rf $external_path/*
-    cp -r %{_builddir}/external/* $external_path
+    cp -LR %{_builddir}/external/* $external_path
   else
     echo "Not find external directory."
   fi
@@ -106,12 +123,17 @@ pushd %{build_dir}/python
 %install
 install -d %{install_includedir}
 install %{build_dir}/annc/service/cpu/kdnn_rewriter.h -t %{install_includedir}
-install %{build_dir}/install/*.patch -t %{install_includedir}
+install %{build_dir}/annc/service/cpu/annc_flags.h -t %{install_includedir}
+install %{build_dir}/annc/service/cpu/xla/*.h -t %{install_includedir}
+install -d %{install_includedir}/bisheng-cpu
+install %{build_dir}/annc/service/bisheng-cpu/*.h -t %{install_includedir}/bisheng-cpu
+cp -r %{build_dir}/annc/tensorflow %{install_includedir}
+cp -r %{build_dir}/install/* %{install_includedir}
 install %{build_dir}/python/tensorflow/kernels/* -t %{install_includedir}
 install -d %{install_libdir}
 output_path=$(find %{build_dir} -type f -name "libannc.so")
 install ${output_path} -t %{install_libdir}
-install %{build_dir}/annc/service/cpu/xla/libs/XNNPACK/build/*.so -t %{install_libdir}
+install %{build_dir}/annc/service/cpu/xla/libs/libXNNPACK.so -t %{install_libdir}
 
 pushd %{build_dir}/python
 %py3_install
@@ -121,9 +143,26 @@ pushd %{build_dir}/python
 %{_libdir}/*
 %{python3_sitelib}/*
 /usr/bin/annc-opt
-/usr/bin/annc-apply-tf
 
 %changelog
+* Tue Nov 11 2025 Chenhui Zheng <zhengchenhui1@huawei.com> - 0.0.2-3
+- Type:Update
+- ID:NA
+- SUG:NA
+- DEC: Add graph optimizer and embedding_fused kernels.
+
+* Thu Sep 11 2025 Chenhui Zheng <zhengchenhui1@huawei.com> - 0.0.2-2
+- Type:Fix
+- ID:NA
+- SUG:NA
+- DEC:Fix pattern conflict & missing header files.
+ +* Fri Aug 22 2025 Chenhui Zheng <zhengchenhui1@huawei.com> - 0.0.2-1 +- Type:Update +- ID:NA +- SUG:NA +- DEC:Release v0.0.2 + * Mon May 12 2025 Chenhui Zheng <zhengchenhui1@huawei.com> - 0.0.1-1 - Type:Init - ID:NA @@ -1,3 +1,6 @@ -871ddedbfbb9aa75c2db497584667f61 ANNC-v0.0.2.tar.gz -a3f0ec5120fa9b65af527332299c9d46 annc_external.tar.gz.aa -f548d6ba0ad0163c0aa3df33250e97c6 annc_external.tar.gz.ab +c727cf97cec102b5399d3f43bcf1dd5a ANNC-v0.0.2.tar.gz +20e5d643ae5e8981686f54d7838a959a XNNPACK.tar.gz +2851e140b6c1c07b44cd4db060f17fbd external.tar.gz.aa +8955b6806f170bbe9a5d6a864e75d2d3 external.tar.gz.ab +88966e263f2f840215f9df1ac34a0389 external.tar.gz.ac +19c62a338990388a31cd2ecb918af855 v3.2.tar.gz diff --git a/x86_64_external_files.patch b/x86_64_external_files.patch index 1f77ebc..7bf4fb3 100644 --- a/x86_64_external_files.patch +++ b/x86_64_external_files.patch @@ -1,3 +1,73 @@ +diff --git a/external/local_config_cc/BUILD b/external/local_config_cc/BUILD +index 6c19fae..a0ea3f8 100755 +--- a/external/local_config_cc/BUILD ++++ b/external/local_config_cc/BUILD +@@ -47,15 +47,15 @@ filegroup( + cc_toolchain_suite( + name = "toolchain", + toolchains = { +- "aarch64|compiler": ":cc-compiler-aarch64", +- "aarch64": ":cc-compiler-aarch64", ++ "k8|compiler": ":cc-compiler-k8", ++ "k8": ":cc-compiler-k8", + "armeabi-v7a|compiler": ":cc-compiler-armeabi-v7a", + "armeabi-v7a": ":cc-compiler-armeabi-v7a", + }, + ) + + cc_toolchain( +- name = "cc-compiler-aarch64", ++ name = "cc-compiler-k8", + toolchain_identifier = "local", + toolchain_config = ":local", + all_files = ":compiler_deps", +@@ -72,7 +72,7 @@ cc_toolchain( + + cc_toolchain_config( + name = "local", +- cpu = "aarch64", ++ cpu = "k8", + compiler = "compiler", + toolchain_identifier = "local", + host_system_name = "local", +@@ -80,7 +80,7 @@ cc_toolchain_config( + target_libc = "local", + abi_version = "local", + abi_libc_version = "local", +- cxx_builtin_include_directories = ["/usr/lib/gcc/aarch64-linux-gnu/10.3.1/include", ++ cxx_builtin_include_directories = ["/usr/lib/gcc/x86_64-linux-gnu/10.3.1/include", + "/usr/local/include", + "/usr/include", + "/root/rpmbuild/BUILD/ANNC-v0.0.2/annc/service/cpu/xla", +@@ -91,7 +91,7 @@ cc_toolchain_config( + "/root/rpmbuild/BUILD/XNNPACK/build/pthreadpool-source/include", + "/root/rpmbuild/BUILD/ANNC-v0.0.2/output/e7b069029cc648c50e1b8083cef52b4f/external/local_config_cc", + "/usr/include/c++/10.3.1", +- "/usr/include/c++/10.3.1/aarch64-linux-gnu", ++ "/usr/include/c++/10.3.1/x86_64-linux-gnu", + "/usr/include/c++/10.3.1/backward"], + tool_paths = {"ar": "/usr/bin/ar", + "ld": "/usr/bin/ld", +diff --git a/external/local_config_cc/builtin_include_directory_paths b/external/local_config_cc/builtin_include_directory_paths +index 188e6c9..2c3bcc1 100755 +--- a/external/local_config_cc/builtin_include_directory_paths ++++ b/external/local_config_cc/builtin_include_directory_paths +@@ -4,7 +4,7 @@ changes to it will be reflected in the action cache key. When some of these + paths change, Bazel will make sure to rerun the action, even though none of + declared action inputs or the action commandline changes. + +-/usr/lib/gcc/aarch64-linux-gnu/10.3.1/include ++/usr/lib/gcc/x86_64-linux-gnu/10.3.1/include + /usr/local/include + /usr/include + /root/rpmbuild/BUILD/ANNC-v0.0.2/annc/service/cpu/xla +@@ -15,5 +15,5 @@ declared action inputs or the action commandline changes. 
+ /root/rpmbuild/BUILD/XNNPACK/build/pthreadpool-source/include + /root/rpmbuild/BUILD/ANNC-v0.0.2/output/e7b069029cc648c50e1b8083cef52b4f/external/local_config_cc + /usr/include/c++/10.3.1 +-/usr/include/c++/10.3.1/aarch64-linux-gnu ++/usr/include/c++/10.3.1/x86_64-linux-gnu + /usr/include/c++/10.3.1/backward diff --git a/external/go_sdk/BUILD.bazel b/x86_64/external/go_sdk/BUILD.bazel index 9cf6add..511ddbc 100644 --- a/external/go_sdk/BUILD.bazel @@ -44,68 +114,6 @@ index 81bd76b..560f94f 100644 const version = `go1.18.4` const defaultGOOS = runtime.GOOS const defaultGOARCH = runtime.GOARCH -diff --git a/external/local_config_cc/BUILD b/x86_64/external/local_config_cc/BUILD -index 51949c4..89f56ce 100755 ---- a/external/local_config_cc/BUILD -+++ b/x86_64/external/local_config_cc/BUILD -@@ -47,15 +47,15 @@ filegroup( - cc_toolchain_suite( - name = "toolchain", - toolchains = { -- "aarch64|compiler": ":cc-compiler-aarch64", -- "aarch64": ":cc-compiler-aarch64", -+ "k8|compiler": ":cc-compiler-k8", -+ "k8": ":cc-compiler-k8", - "armeabi-v7a|compiler": ":cc-compiler-armeabi-v7a", - "armeabi-v7a": ":cc-compiler-armeabi-v7a", - }, - ) - - cc_toolchain( -- name = "cc-compiler-aarch64", -+ name = "cc-compiler-k8", - toolchain_identifier = "local", - toolchain_config = ":local", - all_files = ":compiler_deps", -@@ -72,7 +72,7 @@ cc_toolchain( - - cc_toolchain_config( - name = "local", -- cpu = "aarch64", -+ cpu = "k8", - compiler = "compiler", - toolchain_identifier = "local", - host_system_name = "local", -@@ -80,11 +80,11 @@ cc_toolchain_config( - target_libc = "local", - abi_version = "local", - abi_libc_version = "local", -- cxx_builtin_include_directories = ["/usr/lib/gcc/aarch64-openEuler-linux/12/include", -+ cxx_builtin_include_directories = ["/usr/lib/gcc/x86_64-openEuler-linux/12/include", - "/usr/local/include", - "/usr/include", - "/usr/include/c++/12", -- "/usr/include/c++/12/aarch64-openEuler-linux", -+ "/usr/include/c++/12/x86_64-openEuler-linux", - "/usr/include/c++/12/backward"], - tool_paths = {"ar": "/usr/bin/ar", - "ld": "/usr/bin/ld", -diff --git a/external/local_config_cc/builtin_include_directory_paths b/x86_64/external/local_config_cc/builtin_include_directory_paths -index 711ac34..50fea54 100755 ---- a/external/local_config_cc/builtin_include_directory_paths -+++ b/x86_64/external/local_config_cc/builtin_include_directory_paths -@@ -4,9 +4,9 @@ changes to it will be reflected in the action cache key. When some of these - paths change, Bazel will make sure to rerun the action, even though none of - declared action inputs or the action commandline changes. - --/usr/lib/gcc/aarch64-openEuler-linux/12/include -+/usr/lib/gcc/x86_64-openEuler-linux/12/include - /usr/local/include - /usr/include - /usr/include/c++/12 --/usr/include/c++/12/aarch64-openEuler-linux -+/usr/include/c++/12/x86_64-openEuler-linux - /usr/include/c++/12/backward diff --git a/external/local_config_cc_toolchains/BUILD b/x86_64/external/local_config_cc_toolchains/BUILD index db5234f..f9c0875 100755 --- a/external/local_config_cc_toolchains/BUILD |
