{"id":12373,"date":"2024-08-11T19:50:24","date_gmt":"2024-08-11T11:50:24","guid":{"rendered":"https:\/\/ihower.tw\/blog\/?p=12373"},"modified":"2025-07-04T07:22:27","modified_gmt":"2025-07-03T23:22:27","slug":"rag-chunking","status":"publish","type":"post","link":"https:\/\/ihower.tw\/blog\/12373-rag-chunking","title":{"rendered":"\u4f7f\u7528\u7e41\u9ad4\u4e2d\u6587\u8a55\u6e2c RAG \u7684 Chunking \u5207\u584a\u7b56\u7565"},"content":{"rendered":"\n<blockquote class=\"wp-block-quote is-layout-flow wp-block-quote-is-layout-flow\">\n<p>\u60f3\u7cfb\u7d71\u6027\u5b78\u7fd2\u5982\u4f55\u6253\u9020 LLM\u3001RAG \u548c Agents \u61c9\u7528\u55ce? \u6b61\u8fce\u5831\u540d\u6211\u7684\u8ab2\u7a0b\u00a0<a href=\"https:\/\/aihao.tw\/llm\">\u5927\u8a9e\u8a00\u6a21\u578b LLM \u61c9\u7528\u958b\u767c\u5de5\u4f5c\u574a<\/a><\/p>\n<\/blockquote>\n\n\n\n<figure class=\"wp-block-image size-large\"><a href=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-1.png\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"666\" data-attachment-id=\"12403\" data-permalink=\"https:\/\/ihower.tw\/blog\/12373-rag-chunking\/image-9\" data-orig-file=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-1.png\" data-orig-size=\"1178,766\" data-comments-opened=\"1\" data-image-meta=\"{&quot;aperture&quot;:&quot;0&quot;,&quot;credit&quot;:&quot;&quot;,&quot;camera&quot;:&quot;&quot;,&quot;caption&quot;:&quot;&quot;,&quot;created_timestamp&quot;:&quot;0&quot;,&quot;copyright&quot;:&quot;&quot;,&quot;focal_length&quot;:&quot;0&quot;,&quot;iso&quot;:&quot;0&quot;,&quot;shutter_speed&quot;:&quot;0&quot;,&quot;title&quot;:&quot;&quot;,&quot;orientation&quot;:&quot;0&quot;}\" data-image-title=\"image\" data-image-description=\"\" data-image-caption=\"\" data-medium-file=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-1-300x195.png\" data-large-file=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-1-1024x666.png\" 
src=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-1-1024x666.png\" alt=\"\" class=\"wp-image-12403\" srcset=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-1-1024x666.png 1024w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-1-300x195.png 300w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-1-768x499.png 768w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-1.png 1178w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/a><\/figure>\n\n\n\n<p>\u5ef6\u7e8c\u4e4b\u524d\u505a <a href=\"https:\/\/ihower.tw\/blog\/archives\/12167\">Embedding<\/a> \u548c <a href=\"https:\/\/ihower.tw\/blog\/archives\/12227\">Reranker<\/a> \u8a55\u6e2c\uff0c\u9019\u6b21\u4f86\u7814\u7a76 RAG \u7cfb\u7d71\u4e2d\u7684 Chunking \u5207\u584a\u74b0\u7bc0\u3002\u7531\u65bc embedding \u548c LLM \u6a21\u578b\u7684\u9577\u5ea6\u9650\u5236\uff0c\u6211\u5011\u5fc5\u9808\u5c07\u6240\u6709\u6587\u672c\u8cc7\u6599\uff0c\u62c6\u6210\u5c0f\u584a\u5f8c\u518d\u8f49\u6210\u5411\u91cf\u653e\u9032\u5411\u91cf\u8cc7\u6599\u5eab\u3002<\/p>\n\n\n\n<p>\u4e03\u6708\u4efd Chroma \u505a\u7684\u9019\u7bc7\u975e\u5e38\u68d2 <a href=\"https:\/\/research.trychroma.com\/evaluating-chunking\">Evaluating Chunking Strategies for Retrieval<\/a>\uff0c\u8a55\u6e2c\u4e86\u5e7e\u500b Chunking \u7b56\u7565\uff0c\u4e26\u4e14\u63d0\u51fa\u5169\u7a2e\u65b0\u7684\u5207\u584a\u7b56\u7565\uff0c\u60f3\u7576\u7136\u4ed6\u662f\u7528\u82f1\u6587\u6587\u672c\u505a\u7684\u3002<\/p>\n\n\n\n<p>\u56e0\u70ba\u4ed6\u6709\u516c\u958b <a href=\"https:\/\/github.com\/brandonstarxel\/chunking_evaluation\">GitHub Repo<\/a> \u7a0b\u5f0f\u78bc\u53ef\u4ee5\u91cd\u73fe\u4ed6\u7684\u5be6\u9a57(\u975e\u5e38\u68d2\uff0c\u662f\u771f\u7684\u53ef\u4ee5\u9806\u5229\u57f7\u884c\u7684)\uff0c\u56e0\u6b64\u6211\u5c31\u6539\u6210\u7528\u7e41\u9ad4\u4e2d\u6587\u6587\u672c\u8a66\u8a66\uff0c\u6392\u5217\u7d44\u5408\u51fa\u8a55\u6e2c 38 \u7a2e\u4e0d\u540c chunking 
\u7684\u65b9\u5f0f\u3002<\/p>\n\n\n\n<!--more-->\n\n\n\n<h2 class=\"wp-block-heading\">\u8a55\u4f30\u5be6\u9a57\u65b9\u5f0f<\/h2>\n\n\n\n<p>\u9996\u5148\u6839\u64da\u6587\u672c\u4f86\u5408\u6210\u6e2c\u8a66\u554f\u984c\uff0c\u4ed6\u662f\u5f9e\u4f60\u7d66\u5b9a\u7684\u6587\u672c\u4e2d\u96a8\u6a5f\u6311 4000 tokens\uff0c\u7522\u751f\u554f\u984c\u548c\u81f3\u591a\u4e94\u500b references \u53c3\u8003\u4f86\u6e90\u3002\u4e0d\u9700\u8981\u7522\u751f\u7b54\u6848\uff0c\u56e0\u70ba\u9019\u500b\u8a55\u4f30\u662f\u505a\u6aa2\u7d22\uff0c\u4e0d\u662f\u554f\u7b54\uff0c\u4e0d\u6d89\u53ca\u7b54\u6848\u7684\u751f\u6210\u3002\u7136\u5f8c\u4ed6\u9084\u6703\u522a\u9664\u985e\u4f3c\u7684\u3001\u4e0d\u597d\u7684\u554f\u984c\u3002\u56e0\u6b64\u9700\u8981\u591a\u8dd1\u5e7e\u6b21\u624d\u80fd\u6e4a\u6eff\u8db3\u5920\u7684\u984c\u76ee\u3002<\/p>\n\n\n\n<p>\u7528\u4e0d\u540c chunking \u7b56\u7565\u5207\u584a\u5f8c\uff0c\u7528 OpenAI text-embedding-3-large \u6aa2\u7d22\u51fa\u524d5\u7b46\/10\u7b46\/20\u7b46\uff0c\u8a08\u7b97\u6aa2\u7d22\u5f8c\u7684 Recall \u548c Precision \u6307\u6a19\uff0c\u800c\u4e14\u9019\u662f\u5f9e tokens \u5c64\u9762\u4f86\u8a08\u7b97\u7684\uff0c\u8d85\u4ed4\u7d30\u3002<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><a href=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/recall_precision.png\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"437\" data-attachment-id=\"12374\" data-permalink=\"https:\/\/ihower.tw\/blog\/12373-rag-chunking\/recall_precision\" data-orig-file=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/recall_precision.png\" data-orig-size=\"2062,880\" data-comments-opened=\"1\" 
data-image-meta=\"{&quot;aperture&quot;:&quot;0&quot;,&quot;credit&quot;:&quot;&quot;,&quot;camera&quot;:&quot;&quot;,&quot;caption&quot;:&quot;&quot;,&quot;created_timestamp&quot;:&quot;0&quot;,&quot;copyright&quot;:&quot;&quot;,&quot;focal_length&quot;:&quot;0&quot;,&quot;iso&quot;:&quot;0&quot;,&quot;shutter_speed&quot;:&quot;0&quot;,&quot;title&quot;:&quot;&quot;,&quot;orientation&quot;:&quot;0&quot;}\" data-image-title=\"recall_precision\" data-image-description=\"\" data-image-caption=\"\" data-medium-file=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/recall_precision-300x128.png\" data-large-file=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/recall_precision-1024x437.png\" src=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/recall_precision-1024x437.png\" alt=\"\" class=\"wp-image-12374\" srcset=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/recall_precision-1024x437.png 1024w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/recall_precision-300x128.png 300w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/recall_precision-768x328.png 768w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/recall_precision-1536x656.png 1536w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/recall_precision-2048x874.png 2048w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/recall_precision-1568x669.png 1568w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/a><\/figure>\n\n\n\n<ul class=\"wp-block-list\">\n<li>Recall \u53ec\u56de\u7387: \u6709\u591a\u5c11\u76f8\u95dc\u7684 tokens \u88ab\u6aa2\u7d22\u51fa\u4f86<\/li>\n\n\n\n<li>Precision \u6e96\u78ba\u7387: \u6aa2\u7d22\u51fa\u4f86\u7684 tokens \u4e2d\uff0c\u6709\u591a\u5c11\u662f\u771f\u6b63\u76f8\u95dc\u7684<\/li>\n\n\n\n<li>Precision\u03a9 \u6700\u5927\u6e96\u78ba\u7387: \u5047\u8a2d Recall \u662f\u6eff\u5206\uff0c\u6700\u5927\u7684\u6e96\u78ba\u7387\u662f\u591a\u5c11<\/li>\n\n\n\n<li>IoU: 
\u9019\u662f\u4f5c\u8005\u767c\u660e\u7684\u7d9c\u5408\u6027\u6307\u6a19 (\u8ddf F1 Score \u6709\u9ede\u985e\u4f3c)\uff0c\u9ad8\u5206\u4ee3\u8868\u4e0d\u4f46\u80fd\u5920\u6e96\u78ba\u6aa2\u7d22\u76f8\u95dc\u5167\u5bb9\uff0c\u540c\u6642\u6700\u5c0f\u5316\u4e0d\u76f8\u95dc\u6216\u5197\u9918\u7684\u5167\u5bb9\u3002\u4f4e\u5206\u4ee3\u8868\u904e\u591a\u7121\u95dc\u5167\u5bb9\uff0c\u6216\u662f\u907a\u6f0f\u4e86\u76f8\u95dc\u8cc7\u8a0a\u3002<\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\">Chunking \u7b56\u7565\u8aaa\u660e<\/h2>\n\n\n\n<ul class=\"wp-block-list\">\n<li>FixedTokenChunker: \u56fa\u5b9a\u7167 tokens \u786c\u5207\u3002\u4e0d\u904e\u4ed6\u7684\u9019\u500b\u5be6\u4f5c\u5728\u4e2d\u6587\u4e0d\u80fd\u7528\uff0c\u56e0\u70ba\u4e00\u500b\u4e2d\u6587\u5b57\u4e0d\u4e00\u5b9a\u525b\u597d\u662f\u4e00\u500b token\uff0c\u4ed6\u786c\u5207 tokens \u7684\u7d50\u679c\u9020\u6210\u932f\u8aa4\uff0c\u6839\u672c\u4e0d\u80fd\u8dd1\u3002<\/li>\n\n\n\n<li>RecursiveTokenChunker: \u5c31\u662f\u6700\u591a\u4eba\u7528\u7684 <a href=\"https:\/\/python.langchain.com\/v0.1\/docs\/modules\/data_connection\/document_transformers\/recursive_text_splitter\/\">RecursiveCharacterTextSplitter<\/a> \u5207\u6cd5\uff0c\u6211\u6709\u7a0d\u6539\u4e00\u4e0b\u5206\u9694\u7b26\u865f\uff0c\u52a0\u4e0a\u4e2d\u6587\u7684\u9017\u865f\u53e5\u865f\u7b49\u7b49\u3002\u9019\u500b chunking \u65b9\u5f0f\u9664\u4e86\u6307\u5b9a chunk \u5927\u5c0f\uff0c\u9084\u53ef\u4ee5\u6307\u5b9a\u8981\u524d\u5f8c\u91cd\u8907\u591a\u5c11 tokens\uff0c\u4ee5\u6539\u9032\u4e0a\u4e0b\u6587\u8a9e\u610f\u3002\u7e3d\u5171\u6e2c\u8a66\u4e86 2000, 1500, 1000, 800, 600, 400, 200 \u5207\u584a\u5927\u5c0f\u4ee5\u53ca\u662f\u5426\u91cd\u758a\u3002<\/li>\n\n\n\n<li>KamradtModifiedChunker: \u7531 <a href=\"https:\/\/github.com\/FullStackRetrieval-com\/RetrievalTutorials\/blob\/main\/tutorials\/LevelsOfTextSplitting\/5_Levels_Of_Text_Splitting.ipynb\">Kamradt<\/a> \u63d0\u51fa\u7684\u6bd4\u8f03\u6bcf\u6bb5 embedding 
\u5dee\u7570\u4f86\u8070\u660e\u5207\u584a\u7684\u65b9\u5f0f\uff0c\u6211\u6e2c\u8a66\u4e86 2000, 1500, 1000, 800, 600, 400, 200 \u5207\u584a\u5927\u5c0f\u3002<\/li>\n\n\n\n<li>ClusterSemanticChunker: \u6839\u64da embedding \u4f86\u5206\u7fa4\uff0c\u4e00\u6a23\u6e2c\u8a66\u4e86 2000, 1500, 1000, 800, 600, 400, 200 \u5207\u584a\u5927\u5c0f\u3002<\/li>\n\n\n\n<li>LLMSemanticChunker: \u7528 LLM \u4f86\u5e6b\u6211\u5011\u5207\u584a\uff0c\u6a19\u8a18\u54ea\u6bb5\u8ddf\u54ea\u6bb5\u8981\u62c6\u958b\u3002\u4ed6\u4e0b\u7684 <a href=\"https:\/\/github.com\/brandonstarxel\/chunking_evaluation\/blob\/main\/chunking_evaluation\/chunking\/llm_semantic_chunker.py\">prompt \u50b3\u9001\u9580<\/a>\u3002<\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\">\u6211\u4f7f\u7528\u7684\u4e2d\u6587\u6e2c\u8a66\u6587\u672c<\/h2>\n\n\n\n<ul class=\"wp-block-list\">\n<li>\u79d1\u6280: \u6211 Blog \u4e2d\u7684\u4e00\u7bc7<a href=\"https:\/\/ihower.tw\/blog\/archives\/1750\">\u654f\u6377\u7cfb\u5217\u6587\u7ae0<\/a><\/li>\n\n\n\n<li>\u6295\u8cc7: \u67d0\u4e00\u7bc7 \u4e2d\u570b\u4fe1\u8a17 \u7d93\u6fdf\u91d1\u878d\u60c5\u52e2\u9031\u5831PDF<\/li>\n\n\n\n<li>\u79d1\u666e: \u5beb\u9ede\u79d1\u666e\u7684 <a href=\"https:\/\/kopu.chat\/hbm-cowos-nvidia-2024\">\u5f9e\u840c\u82bd\u5230\u5dd4\u5cf0\uff1aHBM\u8a18\u61b6\u9ad4\uff0c\u53f0\u7a4d\u96fb\u8207NVIDIA\u6210\u738b\u4e4b\u8def<\/a><\/li>\n\n\n\n<li>\u6cd5\u5f8b: <a href=\"https:\/\/law.moj.gov.tw\/LawClass\/LawAll.aspx?pcode=N0030001\">\u52de\u52d5\u57fa\u6e96\u6cd5<\/a><\/li>\n\n\n\n<li>\u5c0f\u8aaa: <a href=\"http:\/\/www.b111.net\/novel\/0\/752\/index.html\">\u7b2c\u4e00\u6b21\u7684\u89aa\u5bc6\u63a5\u89f8<\/a> (\u524d\u5169\u96c6)<\/li>\n<\/ul>\n\n\n\n<p>\u6bcf\u7bc7\u5927\u7d04 1~2 \u842c tokens (cl100k_base)\uff0c\u4e7e\u6de8\u7684\u7d14\u6587\u5b57\u6709\u6bb5\u843d\u5206\u884c\uff0c\u6700\u5f8c\u6211\u5408\u6210\u51fa<a 
href=\"https:\/\/gist.github.com\/ihower\/6da41c8809aa90abf04eefdebb013ea8\">100\u984c<\/a>\u3002<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">\u5408\u6210 Prompt \u4fee\u6539<\/h2>\n\n\n\n<p>\u6211\u6709\u4fee\u6539\u4ed6\u7684 prompt\uff0c\u8981\u6c42\u5408\u6210\u51fa\u4f86\u7684\u554f\u984c\uff0c\u4e5f\u5fc5\u9808\u662f\u7e41\u9ad4\u4e2d\u6587: This question must be in Traditional Chinese (as used in Taiwan).<br>\u53e6\u5916\u70ba\u4e86\u7bc0\u7701\u6210\u672c\uff0c\u6a21\u578b\u63db\u6210 gpt-4o-2024-08-06\uff0c\u4ed6\u672c\u4f86\u7528 gpt-4-turbo\u3002<\/p>\n\n\n\n<h2 class=\"wp-block-heading\">\u8a55\u4f30\u7d50\u679c<\/h2>\n\n\n\n<figure class=\"wp-block-image size-large\"><a href=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-1.png\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"666\" data-attachment-id=\"12403\" data-permalink=\"https:\/\/ihower.tw\/blog\/12373-rag-chunking\/image-9\" data-orig-file=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-1.png\" data-orig-size=\"1178,766\" data-comments-opened=\"1\" data-image-meta=\"{&quot;aperture&quot;:&quot;0&quot;,&quot;credit&quot;:&quot;&quot;,&quot;camera&quot;:&quot;&quot;,&quot;caption&quot;:&quot;&quot;,&quot;created_timestamp&quot;:&quot;0&quot;,&quot;copyright&quot;:&quot;&quot;,&quot;focal_length&quot;:&quot;0&quot;,&quot;iso&quot;:&quot;0&quot;,&quot;shutter_speed&quot;:&quot;0&quot;,&quot;title&quot;:&quot;&quot;,&quot;orientation&quot;:&quot;0&quot;}\" data-image-title=\"image\" data-image-description=\"\" data-image-caption=\"\" data-medium-file=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-1-300x195.png\" data-large-file=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-1-1024x666.png\" src=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-1-1024x666.png\" alt=\"\" class=\"wp-image-12403\" srcset=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-1-1024x666.png 1024w, 
https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-1-300x195.png 300w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-1-768x499.png 768w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-1.png 1178w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/a><\/figure>\n\n\n\n<figure class=\"wp-block-image size-large\"><a href=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-2.png\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"673\" data-attachment-id=\"12404\" data-permalink=\"https:\/\/ihower.tw\/blog\/12373-rag-chunking\/image-10\" data-orig-file=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-2.png\" data-orig-size=\"1177,774\" data-comments-opened=\"1\" data-image-meta=\"{&quot;aperture&quot;:&quot;0&quot;,&quot;credit&quot;:&quot;&quot;,&quot;camera&quot;:&quot;&quot;,&quot;caption&quot;:&quot;&quot;,&quot;created_timestamp&quot;:&quot;0&quot;,&quot;copyright&quot;:&quot;&quot;,&quot;focal_length&quot;:&quot;0&quot;,&quot;iso&quot;:&quot;0&quot;,&quot;shutter_speed&quot;:&quot;0&quot;,&quot;title&quot;:&quot;&quot;,&quot;orientation&quot;:&quot;0&quot;}\" data-image-title=\"image\" data-image-description=\"\" data-image-caption=\"\" data-medium-file=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-2-300x197.png\" data-large-file=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-2-1024x673.png\" src=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-2-1024x673.png\" alt=\"\" class=\"wp-image-12404\" srcset=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-2-1024x673.png 1024w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-2-300x197.png 300w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-2-768x505.png 768w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-2.png 1177w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" 
\/><\/a><\/figure>\n\n\n\n<figure class=\"wp-block-image size-large\"><a href=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-5.png\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"679\" data-attachment-id=\"12407\" data-permalink=\"https:\/\/ihower.tw\/blog\/12373-rag-chunking\/image-13\" data-orig-file=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-5.png\" data-orig-size=\"1174,778\" data-comments-opened=\"1\" data-image-meta=\"{&quot;aperture&quot;:&quot;0&quot;,&quot;credit&quot;:&quot;&quot;,&quot;camera&quot;:&quot;&quot;,&quot;caption&quot;:&quot;&quot;,&quot;created_timestamp&quot;:&quot;0&quot;,&quot;copyright&quot;:&quot;&quot;,&quot;focal_length&quot;:&quot;0&quot;,&quot;iso&quot;:&quot;0&quot;,&quot;shutter_speed&quot;:&quot;0&quot;,&quot;title&quot;:&quot;&quot;,&quot;orientation&quot;:&quot;0&quot;}\" data-image-title=\"image\" data-image-description=\"\" data-image-caption=\"\" data-medium-file=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-5-300x199.png\" data-large-file=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-5-1024x679.png\" src=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-5-1024x679.png\" alt=\"\" class=\"wp-image-12407\" srcset=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-5-1024x679.png 1024w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-5-300x199.png 300w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-5-768x509.png 768w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-5.png 1174w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/a><\/figure>\n\n\n\n<figure class=\"wp-block-image size-large\"><a href=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-4.png\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"665\" data-attachment-id=\"12406\" data-permalink=\"https:\/\/ihower.tw\/blog\/12373-rag-chunking\/image-12\" 
data-orig-file=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-4.png\" data-orig-size=\"1176,764\" data-comments-opened=\"1\" data-image-meta=\"{&quot;aperture&quot;:&quot;0&quot;,&quot;credit&quot;:&quot;&quot;,&quot;camera&quot;:&quot;&quot;,&quot;caption&quot;:&quot;&quot;,&quot;created_timestamp&quot;:&quot;0&quot;,&quot;copyright&quot;:&quot;&quot;,&quot;focal_length&quot;:&quot;0&quot;,&quot;iso&quot;:&quot;0&quot;,&quot;shutter_speed&quot;:&quot;0&quot;,&quot;title&quot;:&quot;&quot;,&quot;orientation&quot;:&quot;0&quot;}\" data-image-title=\"image\" data-image-description=\"\" data-image-caption=\"\" data-medium-file=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-4-300x195.png\" data-large-file=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-4-1024x665.png\" src=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-4-1024x665.png\" alt=\"\" class=\"wp-image-12406\" srcset=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-4-1024x665.png 1024w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-4-300x195.png 300w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-4-768x499.png 768w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-4.png 1176w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/a><\/figure>\n\n\n\n<figure class=\"wp-block-image size-large\"><a href=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-6.png\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"728\" data-attachment-id=\"12408\" data-permalink=\"https:\/\/ihower.tw\/blog\/12373-rag-chunking\/image-14\" data-orig-file=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-6.png\" data-orig-size=\"1607,1143\" data-comments-opened=\"1\" 
data-image-meta=\"{&quot;aperture&quot;:&quot;0&quot;,&quot;credit&quot;:&quot;&quot;,&quot;camera&quot;:&quot;&quot;,&quot;caption&quot;:&quot;&quot;,&quot;created_timestamp&quot;:&quot;0&quot;,&quot;copyright&quot;:&quot;&quot;,&quot;focal_length&quot;:&quot;0&quot;,&quot;iso&quot;:&quot;0&quot;,&quot;shutter_speed&quot;:&quot;0&quot;,&quot;title&quot;:&quot;&quot;,&quot;orientation&quot;:&quot;0&quot;}\" data-image-title=\"image\" data-image-description=\"\" data-image-caption=\"\" data-medium-file=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-6-300x213.png\" data-large-file=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-6-1024x728.png\" src=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-6-1024x728.png\" alt=\"\" class=\"wp-image-12408\" srcset=\"https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-6-1024x728.png 1024w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-6-300x213.png 300w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-6-768x546.png 768w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-6-1536x1093.png 1536w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-6-1568x1115.png 1568w, https:\/\/ihower.tw\/blog\/wp-content\/uploads\/2024\/08\/image-6.png 1607w\" sizes=\"auto, (max-width: 1024px) 100vw, 1024px\" \/><\/a><\/figure>\n\n\n\n<h2 class=\"wp-block-heading\">\u4e00\u4e9b\u7d50\u8ad6\u60f3\u6cd5<\/h2>\n\n\n\n<p>\u6211\u6703\u512a\u5148\u770b Recall \u6307\u6a19\uff0c\u7562\u7adf\u6b63\u78ba\u7684\u53c3\u8003\u8cc7\u6599\u90fd\u6c92\u6aa2\u7d22\u51fa\u4f86\u7684\u8a71\uff0c\u5f8c\u9762 LLM \u8981\u751f\u6210\u4e5f\u5c31 gg \u4e86\u3002<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>\u5728 top-k \u662f 5 \u7684\u60c5\u6cc1\u4e0b\n<ul class=\"wp-block-list\">\n<li>RecursiveTokenChunker \u7684 chunk \u5207\u592a\u5927 1500 \u4ee5\u4e0a\uff0c\u6216\u592a\u5c0f 600 
\u4ee5\u4e0b\u770b\u4f86\u4e0d\u884c<\/li>\n\n\n\n<li>RecursiveTokenChunker_800_400 \u9019\u500b\u914d\u65b9\u770b\u4f86\u662f OK \u7684<\/li>\n<\/ul>\n<\/li>\n\n\n\n<li>\u82e5\u5728\u76f8\u540c\u7e3d chunk tokens \u6578\u7684\u689d\u4ef6\u4e0b\uff0c\u5982\u679c\u5207\u5c0f\u584a\u4f46\u662f top-k \u53d6\u9ad8\u7684\u8a71\uff0c\u4f8b\u5982 RecursiveTokenChunker_400_200 \u7528 top-k 10\uff0c\u5247 Recall \u53ef\u4ee5\u66f4\u9ad8! \u4f46\u4e5f\u4e0d\u80fd\u5207\u592a\u5c0f\u584a\u3002<\/li>\n\n\n\n<li>\u6240\u4ee5 chunking \u7684\u5927\u5c0f\uff0c\u4e5f\u8ddf\u5230\u5e95\u8981\u6aa2\u7d22\u591a\u5c11\u7b46\u7684 top-k \u6709\u95dc\uff0c\u7e3d chunk tokens \u80fd\u7528\u591a\u5c11\u8ddf\u4f60 LLM context window \u9650\u5236\u6709\u95dc<\/li>\n\n\n\n<li>\u6709\u7528 chunk overlap \u6a19\u6e96\u5dee\u6703\u6bd4\u8f03\u5c0f\uff0c\u7d50\u679c\u6703\u6bd4\u8f03\u7a69\u5b9a\uff0c\u4ee3\u50f9\u662f Precision \u6bd4\u8f03\u5dee\u4e00\u9ede\uff0c\u96dc\u8a0a\u8b8a\u591a\uff0c\u6bd4\u8f03\u6d6a\u8cbb\u6210\u672c\uff0c\u5c0d\u6a21\u578b\u751f\u6210\u4e5f\u662f\uff0c\u4f8b\u5982\u53ef\u80fd\u6aa2\u7d22\u51fa 4 \u500b chunks\uff0c\u4f46\u88e1\u9762\u5feb\u4e00\u534a\u90fd\u662f\u91cd\u8907\u7684\u6587\u5b57\u2026. 
XD<\/li>\n\n\n\n<li>(\u5728 top-k 5 \u7684\u60c5\u6cc1\u4e0b) KamradtModifiedChunker \u8ddf ClusterSemanticChunker \u505a\u51fa\u4f86\u6c92\u60f3\u50cf\u4e2d\u597d\uff0c\u539f\u8a55\u6e2c\u7528\u82f1\u6587\u662f\u9ad8\u5206\u7684\uff0c\u4f46\u9019\u88e1\u6211\u505a\u4e2d\u6587\u6bd4\u8f03\u5dee\u3002\u500b\u4eba\u731c\u6e2c\u662f\u9019\u500b\u4f9d\u8cf4\u65bc embedding \u6a21\u578b\u80fd\u529b\uff0c\u800c\u4e2d\u6587\u7684 embedding \u80fd\u529b\u7562\u7adf\u9084\u662f\u5dee\u4e86\u4e00\u9ede<\/li>\n\n\n\n<li>LLMSemanticChunker \u9084\u4e0d\u932f\uff0cPrecision \u6bd4 RecursiveTokenChunker_800_400 \u9ad8\uff0c\u4f46\u539f\u8a55\u6e2c\u7528\u9019\u62db\u7684 Recall \u5206\u6578\u662f\u7b2c\u4e00\uff0c\u63db\u6210\u4e2d\u6587\u5f8c\u6027\u80fd\u4e5f\u662f\u6709\u6240\u4e0b\u964d\u3002\u5982\u679c\u6587\u4ef6\u672c\u4f86\u5c31\u6709\u7528 LLM \u6e05\u7406\u8cc7\u6599\uff0c\u6211\u6703\u8003\u616e\u9806\u9053\u8a66\u8a66\u9019\u62db\u3002<\/li>\n<\/ul>\n\n\n\n<p>\u5728 Recall \u5dee\u4e0d\u591a\u7684\u60c5\u6cc1\u4e0b\uff0c\u53ef\u4ee5\u9078 Precision \u9ad8\u7684\uff0c\u6bd4\u8f03\u6709\u6548\u7387\u3002\u7562\u7adf\u5f8c\u9762 LLM \u751f\u6210\u6642\uff0c\u5728\u4e00\u6a23\u7684 context window \u9650\u5236\u4e0b: \u55ae\u4e00 chunk \u5207\u8d8a\u5927\uff0c\u6700\u5f8c\u80fd\u53d6\u7684 chunks \u7e3d\u6578\u5c31\u6bd4\u8f03\u5c11\uff0c\u4f8b\u5982 top-k 5\uff0c\u56e0\u70ba\u4e00\u584a\u5c31\u9019\u9ebc\u5927\u3002\u5207\u8d8a\u5c0f\uff0c\u6700\u5f8c\u53d6\u7684 chunks \u7e3d\u6578\u5c31\u6bd4\u8f03\u591a\uff0c\u4f8b\u5982 top-k 10\uff0c\u6bd4\u8f03\u6709\u5f48\u6027\u7a7a\u9593\u3002<\/p>\n\n\n\n<p>\u81f3\u65bc\u662f\u5426\u8981\u72a7\u7272 Recall \u63db Precision\uff0c\u6211\u8a8d\u70ba\u8ddf\u4f60\u6a21\u578b\u5f37\u4e0d\u5f37\u3001context window \u53ef\u4ee5\u585e\u591a\u5c11 chunks \u6709\u95dc:<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>Precision \u8ddf\u5207\u584a\u5927\u5c0f\u662f\u975e\u5e38\u76f8\u95dc\uff0c\u5207\u8d8a\u5c0f Precision 
\u8d8a\u6e96\uff0c\u5207\u8d8a\u5927 Precision \u8d8a\u5dee<\/li>\n\n\n\n<li>\u5982\u679c\u662f\u80fd\u529b\u5f37\u7684\u5927\u6a21\u578b\uff0cPrecision \u4f4e\u4e00\u9ede\uff0c\u6709\u96dc\u8a0a\u4e5f\u6c92\u95dc\u4fc2<\/li>\n\n\n\n<li>\u5982\u679c\u662f\u80fd\u529b\u5f31\u7684\u5c0f\u6a21\u578b\uff0c\u5bb9\u6613\u88ab\u96dc\u8a0a\u5f71\u97ff\uff0cPrecision \u592a\u4f4e\u751f\u6210\u6548\u679c\u6050\u6703\u4e0d\u597d\uff0c\u56e0\u6b64\u5207\u584a\u9700\u8981\u5c0f<\/li>\n<\/ul>\n\n\n\n<p>\u6700\u5f8c\uff0c\u4f5c\u8005\u7684\u9019\u500b\u5207\u584a\u8a55\u4f30\u6846\u67b6\u883b\u4e0d\u932f\u7684\uff0c\u5982\u679c\u6709\u5be6\u969b\u5c08\u6848\u7528\u7684\u6587\u672c\uff0c\u63a8\u85a6\u53ef\u4ee5\u81ea\u5df1\u8dd1\u4e00\u8dd1\uff0c\u770b\u770b\u9069\u5408\u4f60\u7684\u6700\u4f73 Chunking \u7b56\u7565\u662f\u4ec0\u9ebc\u3002<\/p>\n\n\n\n<p><a href=\"https:\/\/www.facebook.com\/ihower\/posts\/10161305867168971\">\u6b64\u8a55\u6e2c Facebook \u8cbc\u6587\u8a0e\u8ad6 \u50b3\u9001\u9580 \u2197\ufe0f<\/a>&nbsp;(\u6b61\u8fce\u6309\u8b9a\u3001\u8ffd\u8e64\u3001\u5206\u4eab)<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u60f3\u7cfb\u7d71\u6027\u5b78\u7fd2\u5982\u4f55\u6253\u9020 LLM\u3001RAG \u548c Agents \u61c9\u7528\u55ce? 
\u6b61\u8fce\u5831\u540d\u6211\u7684\u8ab2\u7a0b\u00a0\u5927\u8a9e\u8a00\u6a21\u578b LLM \u61c9\u7528\u958b &hellip; <\/p>\n<p class=\"link-more\"><a href=\"https:\/\/ihower.tw\/blog\/12373-rag-chunking\" class=\"more-link\">\u95b1\u8b80\u5168\u6587<span class=\"screen-reader-text\">\u3008\u4f7f\u7528\u7e41\u9ad4\u4e2d\u6587\u8a55\u6e2c RAG \u7684 Chunking \u5207\u584a\u7b56\u7565\u3009<\/span><\/a><\/p>\n","protected":false},"author":1,"featured_media":0,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"_monsterinsights_skip_tracking":false,"_monsterinsights_sitenote_active":false,"_monsterinsights_sitenote_note":"","_monsterinsights_sitenote_category":0,"jetpack_post_was_ever_published":false,"_jetpack_newsletter_access":"","_jetpack_dont_email_post_to_subs":false,"_jetpack_newsletter_tier_id":0,"_jetpack_memberships_contains_paywalled_content":false,"_jetpack_memberships_contains_paid_content":false,"footnotes":"","jetpack_publicize_message":"","jetpack_publicize_feature_enabled":true,"jetpack_social_post_already_shared":true,"jetpack_social_options":{"image_generator_settings":{"template":"highway","default_image_id":0,"font":"","enabled":false},"version":2}},"categories":[80],"tags":[],"class_list":["post-12373","post","type-post","status-publish","format-standard","hentry","category-llm","entry"],"jetpack_publicize_connections":[],"jetpack_featured_media_url":"","jetpack_shortlink":"https:\/\/wp.me\/p1q6tG-3dz","jetpack_sharing_enabled":true,"jetpack_likes_enabled":true,"_links":{"self":[{"href":"https:\/\/ihower.tw\/blog\/wp-json\/wp\/v2\/posts\/12373","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/ihower.tw\/blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/ihower.tw\/blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/ihower.tw\/blog\/wp-json\/wp\/v2\/users\/1"}],"replies":[{"embeddable":true,"href":"https:\/\/ihower.tw\/blog\/wp-json\/wp\/v2\
/comments?post=12373"}],"version-history":[{"count":29,"href":"https:\/\/ihower.tw\/blog\/wp-json\/wp\/v2\/posts\/12373\/revisions"}],"predecessor-version":[{"id":12705,"href":"https:\/\/ihower.tw\/blog\/wp-json\/wp\/v2\/posts\/12373\/revisions\/12705"}],"wp:attachment":[{"href":"https:\/\/ihower.tw\/blog\/wp-json\/wp\/v2\/media?parent=12373"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/ihower.tw\/blog\/wp-json\/wp\/v2\/categories?post=12373"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/ihower.tw\/blog\/wp-json\/wp\/v2\/tags?post=12373"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}