

TensorRT: Notes on Dynamic-Batch and Multi-Batch Inference

2023-07-24 17:41 · Author: 喜歡玩AI的東哥

Multi-batch inference of YOLOv8 with TensorRT:

Step 1: Convert the .pt model to ONNX

The exported ONNX model has a dynamic batch dimension and a dynamic input size (with the Ultralytics exporter this corresponds to passing dynamic=True).

[Figure: the YOLOv8 conversion script]
[Figure: inputs and outputs viewed in Netron are dynamic parameters]
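Between steps 1 and 2 the ONNX file still has to be built into a TensorRT engine whose optimization profile covers the batch range used later. The post does not show this step; below is a minimal sketch, assuming a TensorRT 8-style API, a placeholder gLogger, and an example 1/4/8 min/opt/max batch range:

// Build an engine from the dynamic-batch ONNX with an optimization profile.
// gLogger, the file name, and the 1/4/8 batch range are assumptions.
nvinfer1::IBuilder* builder = nvinfer1::createInferBuilder(gLogger);
nvinfer1::INetworkDefinition* network = builder->createNetworkV2(
    1U << static_cast<uint32_t>(nvinfer1::NetworkDefinitionCreationFlag::kEXPLICIT_BATCH));
nvonnxparser::IParser* parser = nvonnxparser::createParser(*network, gLogger);
parser->parseFromFile("yolov8n.onnx", static_cast<int>(nvinfer1::ILogger::Severity::kWARNING));

nvinfer1::IBuilderConfig* config = builder->createBuilderConfig();
nvinfer1::IOptimizationProfile* profile = builder->createOptimizationProfile();
// The profile pins the range the dynamic batch dimension may take at runtime.
profile->setDimensions("images", nvinfer1::OptProfileSelector::kMIN, nvinfer1::Dims4(1, 3, 640, 640));
profile->setDimensions("images", nvinfer1::OptProfileSelector::kOPT, nvinfer1::Dims4(4, 3, 640, 640));
profile->setDimensions("images", nvinfer1::OptProfileSelector::kMAX, nvinfer1::Dims4(8, 3, 640, 640));
config->addOptimizationProfile(profile);

// Serialize the engine; writing serialized->data() to disk gives the .engine
// file that step 2 deserializes.
nvinfer1::IHostMemory* serialized = builder->buildSerializedNetwork(*network, *config);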

Step 2: Load the engine, create a context, and fix the input dimensions

nvinfer1::ICudaEngine* engine_infer = engine_runtime->deserializeCudaEngine(data.get(), length, nullptr);
nvinfer1::IExecutionContext* engine_context = engine_infer->createExecutionContext();

int input_index = engine_infer->getBindingIndex("images");   // 1x3x640x640
int output_index = engine_infer->getBindingIndex("output0");

// The engine has a dynamic batch (BATCH_SIZE, 3, width, height); dynamic dims read back as -1
nvinfer1::Dims inputSize = engine_infer->getBindingDimensions(input_index);
nvinfer1::Dims outputSize = engine_infer->getBindingDimensions(output_index);

std::cout << "input index: " << input_index << ", output index: " << output_index << std::endl;

if (engine_context == nullptr)
{
    std::cerr << "Failed to create TensorRT Execution Context." << std::endl;
}

// Fix the context's input shape to (BATCH_SIZE, 3, 640, 640); with dynamic shapes
// this must be done before inference, otherwise the bindings stay unresolved
engine_context->setBindingDimensions(0, nvinfer1::Dims4(BATCH_SIZE, 3, 640, 640));
inputSize = engine_context->getBindingDimensions(input_index);
outputSize = engine_context->getBindingDimensions(output_index);
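Because the binding dimensions are only resolved after setBindingDimensions, it can be safer to size buffers from the context instead of hard-coded constants. A small helper (not part of the original code) could look like this:

// Element count of a resolved Dims; multiply by sizeof(float) for bytes.
inline size_t volume(const nvinfer1::Dims& d)
{
    size_t v = 1;
    for (int i = 0; i < d.nbDims; ++i)
        v *= static_cast<size_t>(d.d[i]);
    return v;
}

With it, the allocations in step 3 could be written as cudaMalloc(&buffers[0], volume(inputSize) * sizeof(float)), and analogously for the output.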

Step 3: Preprocessing and batching the input images

// i is the index of the outer loop over batches (not shown)
for (size_t j = 0; j < BATCH_SIZE; j++)
{
    // Read the image with OpenCV
    cv::Mat image = images[i * BATCH_SIZE + j];
    std::cout << fn[i] << std::endl;
    afterScale = true;
    int step = j * INPUT_SIZE * INPUT_SIZE * 3;

    // preProcess
    if (afterScale)
    {
        // Method 1: boxes come out in network coordinates and are rescaled after NMS
        preprocess(image, h_input);
        memcpy(h_inputs + step, h_input, INPUT_SIZE * INPUT_SIZE * 3 * sizeof(float));
    }
    else
    {
        // Method 2: keep the resize factor so boxes can be mapped back while decoding
        factor = preprocess(image, h_input, INPUT_W, INPUT_H, 3);
        memcpy(h_inputs + step, h_input, INPUT_SIZE * INPUT_SIZE * 3 * sizeof(float));
    }
}

void* buffers[2];
cudaMalloc(&buffers[0], BATCH_SIZE * INPUT_SIZE * INPUT_SIZE * 3 * sizeof(float));    // <- input
cudaMalloc(&buffers[1], BATCH_SIZE * OUTPUT_SIZE * (NUMS_CLASS + 4) * sizeof(float)); // <- output0

cudaMemcpy(buffers[0], h_inputs, BATCH_SIZE * INPUT_SIZE * INPUT_SIZE * 3 * sizeof(float), cudaMemcpyHostToDevice);
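The preprocess helper itself is not shown in the post. What follows is a minimal sketch of what the Method-2 overload presumably does; the letterbox alignment, padding color, and normalization are assumptions:

// Hypothetical sketch of the Method-2 preprocess (requires <opencv2/opencv.hpp>):
// resize into a letterboxed input_w x input_h canvas, BGR -> RGB, scale to [0,1],
// pack as planar CHW floats, and return the factor that maps network coordinates
// back to original-image pixels.
float preprocess(const cv::Mat& image, float* h_input, int input_w, int input_h, int channels)
{
    const float scale = std::min(input_w / float(image.cols), input_h / float(image.rows));
    cv::Mat resized;
    cv::resize(image, resized, cv::Size(int(image.cols * scale), int(image.rows * scale)));

    // Gray letterbox padding, top-left aligned (an assumption about the author's scheme)
    cv::Mat canvas(input_h, input_w, CV_8UC3, cv::Scalar(114, 114, 114));
    resized.copyTo(canvas(cv::Rect(0, 0, resized.cols, resized.rows)));

    cv::cvtColor(canvas, canvas, cv::COLOR_BGR2RGB);
    canvas.convertTo(canvas, CV_32FC3, 1.0 / 255.0);

    // HWC -> CHW: each plane aliases a slice of h_input, so split writes in place
    std::vector<cv::Mat> planes;
    for (int c = 0; c < channels; ++c)
        planes.emplace_back(input_h, input_w, CV_32FC1, h_input + c * input_h * input_w);
    cv::split(canvas, planes);

    return 1.0f / scale; // multiply network-space boxes by this to recover original pixels
}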


Step 4: Inference

// Synchronous inference
engine_context->executeV2(buffers);

// Asynchronous inference (see the sketch below)
//engine_context->enqueueV2(buffers, stream, start);

cudaMemcpy(h_output, buffers[1], BATCH_SIZE * OUTPUT_SIZE * (NUMS_CLASS + 4) * sizeof(float), cudaMemcpyDeviceToHost);
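For the asynchronous path commented out above, a minimal sketch could look like the following. The stream handling here is an assumption, the final nullptr of enqueueV2 opts out of the input-consumed event, and for the copies to actually overlap with other work, h_inputs and h_output would need to be allocated with cudaMallocHost:

// Async variant: stage H2D copy, inference, and D2H copy on one CUDA stream,
// then block once at the end instead of after every call.
cudaStream_t stream;
cudaStreamCreate(&stream);

cudaMemcpyAsync(buffers[0], h_inputs,
                BATCH_SIZE * INPUT_SIZE * INPUT_SIZE * 3 * sizeof(float),
                cudaMemcpyHostToDevice, stream);
engine_context->enqueueV2(buffers, stream, nullptr);
cudaMemcpyAsync(h_output, buffers[1],
                BATCH_SIZE * OUTPUT_SIZE * (NUMS_CLASS + 4) * sizeof(float),
                cudaMemcpyDeviceToHost, stream);

cudaStreamSynchronize(stream); // h_output is valid only after this returns
cudaStreamDestroy(stream);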


Step 5: Postprocessing

// postProcess: decode and NMS each image of the batch separately
for (size_t bsi = 0; bsi < BATCH_SIZE; bsi++)
{
    int step = bsi * OUTPUT_SIZE * (NUMS_CLASS + 4);

    const int out_rows = NUMS_CLASS + 4; // rows of the "output0" node
    const int out_cols = OUTPUT_SIZE;    // cols of the "output0" node
    const cv::Mat det_output(out_rows, out_cols, CV_32F, (float*)h_output + step);

    std::vector<cv::Rect> boxes;
    std::vector<int> class_ids;
    std::vector<float> confidences;
    kNmsThresh = 0.3f;
    kConfThresh = 0.2f;
    kClassScore = 0.2f;

    // Decode; with Method 2 the boxes come out directly in original-image scale.
    // The output layout is [11, 8400]: each column is one candidate box (so at most
    // 8400 boxes), the first 4 rows are cx, cy, ow, oh, and the remaining 7 rows
    // are the per-class confidences.

    for (int i = 0; i < det_output.cols; ++i) {
        const cv::Mat classes_scores = det_output.col(i).rowRange(4, 11); // extract the class scores
        cv::Point class_id_point;
        double score;
        cv::minMaxLoc(classes_scores, nullptr, &score, nullptr, &class_id_point); // best class and its position

        // Scores lie between 0 and 1
        if (score > kClassScore) {
            const float cx = det_output.at<float>(0, i);
            const float cy = det_output.at<float>(1, i);
            const float ow = det_output.at<float>(2, i);
            const float oh = det_output.at<float>(3, i);
            cv::Rect box;
            if (afterScale)
            {
                // Method 1: keep raw network coordinates; they are rescaled after NMS
                box.x = static_cast<int>(cx);
                box.y = static_cast<int>(cy);
                box.width = static_cast<int>(ow);
                box.height = static_cast<int>(oh);
            }
            else
            {
                // Method 2: map to original-image pixels with the preprocessing factor
                //const float scale = std::min(INPUT_H / float(image.rows), INPUT_W / float(image.cols));
                //const float factor = 1 / scale;
                box.x = static_cast<int>((cx - 0.5 * ow) * factor);
                box.y = static_cast<int>((cy - 0.5 * oh) * factor);
                box.width = static_cast<int>(ow * factor);
                box.height = static_cast<int>(oh * factor);
            }
            boxes.push_back(box);
            class_ids.push_back(class_id_point.y); // the row inside classes_scores is the class id
            confidences.push_back(score);
        }
    }

    // NMS: suppress redundant overlapping boxes with lower confidence
    std::vector<int> indexes;
    cv::dnn::NMSBoxes(boxes, confidences, kConfThresh, kNmsThresh, indexes);

    cv::Mat disImage = disPlayImages[i * BATCH_SIZE + bsi];

    if (!afterScale)
    {
        // Method 2: boxes are already in original-image scale, draw them directly
        for (size_t k = 0; k < indexes.size(); k++) {
            const int index = indexes[k];
            const int idx = class_ids[index];
            cv::rectangle(disImage, boxes[index], cv::Scalar(0, 0, 255), 2, 8);
            cv::rectangle(disImage, cv::Point(boxes[index].tl().x, boxes[index].tl().y - 20),
                cv::Point(boxes[index].br().x, boxes[index].tl().y), cv::Scalar(0, 255, 255), -1);
            std::string nameScore = class_names[idx] + "  " + std::to_string(confidences[index]);
            cv::putText(disImage, nameScore, cv::Point(boxes[index].tl().x, boxes[index].tl().y - 10), cv::FONT_HERSHEY_SIMPLEX, 0.5, cv::Scalar(0, 0, 0));
        }

        std::string savePath = "trt_res/result_" + std::to_string(i) + "_" + std::to_string(bsi) + ".jpg";
        cv::imwrite(savePath, disImage);
    }
    else
    {
        // Method 1: boxes are still in network scale; rescale them to the original image
        std::vector<Bbox> pred_box;
        for (size_t k = 0; k < indexes.size(); k++) {
            const int index = indexes[k];

            Bbox box;
            box.x = boxes[index].x;
            box.y = boxes[index].y;
            box.w = boxes[index].width;
            box.h = boxes[index].height;
            box.score = confidences[index];
            box.classes = class_ids[index];
            pred_box.push_back(box);
        }
        std::vector<Bbox> out = rescale_box(pred_box, disImage.cols, disImage.rows);
        cv::Mat img = renderBoundingBox(disImage, out);
        std::string savePath = "trt_res/result_" + std::to_string(i) + "_" + std::to_string(bsi) + ".jpg";
        cv::imwrite(savePath, img);
        nums++;
    }
} // end of the per-image loop over bsi
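rescale_box is likewise not shown in the post. Under the assumption that Bbox carries center-format coordinates (as pushed above) and that preprocessing used top-left-aligned letterbox scaling, a minimal sketch might be:

// Hypothetical rescale_box: map center-format network-space boxes back to
// top-left-format boxes in original-image pixels.
std::vector<Bbox> rescale_box(const std::vector<Bbox>& in, int img_w, int img_h)
{
    const float scale = std::min(INPUT_W / float(img_w), INPUT_H / float(img_h));
    std::vector<Bbox> out;
    out.reserve(in.size());
    for (const Bbox& b : in)
    {
        Bbox r = b;
        r.x = static_cast<int>((b.x - 0.5f * b.w) / scale); // center -> top-left, then rescale
        r.y = static_cast<int>((b.y - 0.5f * b.h) / scale);
        r.w = static_cast<int>(b.w / scale);
        r.h = static_cast<int>(b.h / scale);
        out.push_back(r);
    }
    return out;
}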


Step 6: Inference speed and GPU memory usage


Single-batch inference (batchsize = 1):
GPU memory: 2.2 GB at rest, 3 GB while running.
Average time per image: 2.3 ms.

Multi-batch inference (batchsize = 4):
GPU memory: 2.2 GB at rest, 3.1 GB while running; inference takes 4.7 ms per batch.
Average time per image: 4.7 ms / 4 ≈ 1.2 ms.

[Figure: inference results]


Summary:

The advantage of multi-batch inference is higher throughput; the advantage of single-batch inference is lower latency, which keeps the system real-time. With a batch size chosen in the right spot, you get the best trade-off between throughput and latency; the appropriate value has to be determined from the project's process requirements and from the device itself.

One extension: a project can rarely be solved by a single model. When several models are involved, each model file can be loaded repeatedly into several engines, but each engine binds only one context. By driving these contexts from concurrent threads with asynchronous inference, you can obtain good real-time behavior and throughput at the same time. The usage layout is shown in the figure below; since different devices can host different numbers of models, matching devices and models to the actual requirements maximizes the overall benefit.

[Figure: example structure diagram of TensorRT usage]
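As a rough illustration of that pattern, the sketch below runs one context per engine, each on its own thread and CUDA stream; engines, the per-thread buffers, and the loop body are placeholders rather than the author's code:

// One context + one stream per thread (requires <thread> and <vector>).
void worker(nvinfer1::ICudaEngine* engine)
{
    nvinfer1::IExecutionContext* ctx = engine->createExecutionContext();
    cudaStream_t stream;
    cudaStreamCreate(&stream);

    ctx->setBindingDimensions(0, nvinfer1::Dims4(BATCH_SIZE, 3, 640, 640));
    // ... allocate per-thread buffers, then in a loop:
    // ctx->enqueueV2(buffers, stream, nullptr);
    // cudaStreamSynchronize(stream);

    cudaStreamDestroy(stream);
    ctx->destroy();
}

// Launch one worker per engine; `engines` is a placeholder container.
std::vector<std::thread> pool;
for (nvinfer1::ICudaEngine* e : engines)
    pool.emplace_back(worker, e);
for (auto& t : pool)
    t.join();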



The code is on my GitHub: https://github.com/dongguazi.

