ggml-org/llama.cpp

Feature Request: Allow Filtering LLama Server Response Fields

Open

#10,819 创建于 2024年12月13日

在 GitHub 查看
 (10 评论) (0 反应) (0 负责人)C++ (110,169 star) (18,202 fork)batch import
enhancement, good first issue

描述

Prerequisites

  • I am running the latest code. Mention the version if possible as well.
  • I carefully followed the README.md.
  • I searched using keywords relevant to my issue to make sure that I am creating a new issue that is not already open (or closed).
  • I reviewed the Discussions, and have a new and useful enhancement to share.

Feature Description

Currently, the llama.cpp server serializes a lot of data the caller may not care about. Example response:

{
    "index": 0,
    "content": "[\n                {\n                  \"function_name\": \"create_user\",\n                  \"username\": \"my_user\",\n                  \"email\": \"my_email@example.com\",\n                  \"password\": \"password123\"\n                }\n              ]\n            \t\t\t\t\t\t\t\t",
    "id_slot": 0,
    "stop": true,
    "model": "gpt-3.5-turbo-0613",
    "tokens_predicted": 76,
    "tokens_evaluated": 5,
    "generation_settings": {
        "n_predict": -1,
        "seed": 4294967295,
        "temperature": 0.800000011920929,
        "dynatemp_range": 0.0,
        "dynatemp_exponent": 1.0,
        "top_k": 40,
        "top_p": 0.949999988079071,
        "min_p": 0.05000000074505806,
        "xtc_probability": 0.0,
        "xtc_threshold": 0.10000000149011612,
        "typical_p": 1.0,
        "repeat_last_n": 64,
        "repeat_penalty": 1.0,
        "presence_penalty": 0.0,
        "frequency_penalty": 0.0,
        "dry_multiplier": 0.0,
        "dry_base": 1.75,
        "dry_allowed_length": 2,
        "dry_penalty_last_n": -1,
        "dry_sequence_breakers": [
            "\n",
            ":",
            "\"",
            "*"
        ],
        "mirostat": 0,
        "mirostat_tau": 5.0,
        "mirostat_eta": 0.10000000149011612,
        "penalize_nl": false,
        "stop": [],
        "max_tokens": -1,
        "n_keep": 0,
        "n_discard": 0,
        "ignore_eos": false,
        "stream": false,
        "logit_bias": [],
        "n_probs": 0,
        "min_keep": 0,
        "grammar": "char ::= [^\"\\\\\\x7F\\x00-\\x1F] | [\\\\] ([\"\\\\bfnrt] | \"u\" [0-9a-fA-F]{4})\ndecimal-part ::= [0-9]{1,16}\nintegral-part ::= [0] | [1-9] [0-9]{0,15}\nitem ::= item-0 | item-1 | item-2\nitem-0 ::= \"{\" space item-0-function-name-kv \",\" space item-0-items-kv ( \",\" space ( item-0-tax-rate-kv ) )? \"}\" space\nitem-0-function-name ::= \"\\\"calculate_total\\\"\" space\nitem-0-function-name-kv ::= \"\\\"function_name\\\"\" space \":\" space item-0-function-name\nitem-0-items ::= \"[\" space (item-0-items-item (\",\" space item-0-items-item)*)? \"]\" space\nitem-0-items-item ::= \"{\" space item-0-items-item-name-kv \",\" space item-0-items-item-price-kv \"}\" space\nitem-0-items-item-name-kv ::= \"\\\"name\\\"\" space \":\" space string\nitem-0-items-item-price-kv ::= \"\\\"price\\\"\" space \":\" space number\nitem-0-items-kv ::= \"\\\"items\\\"\" space \":\" space item-0-items\nitem-0-tax-rate-kv ::= \"\\\"tax_rate\\\"\" space \":\" space number\nitem-1 ::= \"{\" space item-1-function-name-kv \",\" space item-1-to-kv \",\" space item-1-subject-kv \",\" space item-1-body-kv ( \",\" space ( item-1-attachments-kv ) )? \"}\" space\nitem-1-attachments ::= \"[\" space (item-1-attachments-item (\",\" space item-1-attachments-item)*)? \"]\" space\nitem-1-attachments-item ::= \"{\" space item-1-attachments-item-filename-kv \",\" space item-1-attachments-item-content-kv \"}\" space\nitem-1-attachments-item-content-kv ::= \"\\\"content\\\"\" space \":\" space string\nitem-1-attachments-item-filename-kv ::= \"\\\"filename\\\"\" space \":\" space string\nitem-1-attachments-kv ::= \"\\\"attachments\\\"\" space \":\" space item-1-attachments\nitem-1-body-kv ::= \"\\\"body\\\"\" space \":\" space string\nitem-1-function-name ::= \"\\\"send_email\\\"\" space\nitem-1-function-name-kv ::= \"\\\"function_name\\\"\" space \":\" space item-1-function-name\nitem-1-subject-kv ::= \"\\\"subject\\\"\" space \":\" space string\nitem-1-to-kv ::= \"\\\"to\\\"\" space \":\" space string\nitem-2 ::= \"{\" space item-2-function-name-kv \",\" space item-2-username-kv \",\" space item-2-email-kv \",\" space item-2-password-kv ( \",\" space ( item-2-role-kv ) )? \"}\" space\nitem-2-email-kv ::= \"\\\"email\\\"\" space \":\" space string\nitem-2-function-name ::= \"\\\"create_user\\\"\" space\nitem-2-function-name-kv ::= \"\\\"function_name\\\"\" space \":\" space item-2-function-name\nitem-2-password ::= \"\\\"\" char{8,} \"\\\"\" space\nitem-2-password-kv ::= \"\\\"password\\\"\" space \":\" space item-2-password\nitem-2-role ::= (\"\\\"admin\\\"\" | \"\\\"user\\\"\" | \"\\\"editor\\\"\") space\nitem-2-role-kv ::= \"\\\"role\\\"\" space \":\" space item-2-role\nitem-2-username-kv ::= \"\\\"username\\\"\" space \":\" space string\nnumber ::= (\"-\"? integral-part) (\".\" decimal-part)? ([eE] [-+]? integral-part)? space\nroot ::= \"[\" space item (\",\" space item){0,0} \"]\" space\nspace ::= | \" \" | \"\\n\" [ \\t]{0,20}\nstring ::= \"\\\"\" char* \"\\\"\" space\n",
        "samplers": [
            "dry",
            "top_k",
            "typ_p",
            "top_p",
            "min_p",
            "xtc",
            "temperature"
        ],
        "speculative.n_max": 16,
        "speculative.n_min": 5,
        "speculative.p_min": 0.8999999761581421,
        "timings_per_token": false
    },
    "prompt": "<s> Create a new user",
    "has_new_line": 1,
    "truncated": false,
    "stop_type": "eos",
    "stopping_word": "",
    "tokens_cached": 80,
    "timings": {
        "prompt_n": 1,
        "prompt_ms": 46.258,
        "prompt_per_token_ms": 46.258,
        "prompt_per_second": 21.617882312248693,
        "predicted_n": 76,
        "predicted_ms": 1864.038,
        "predicted_per_token_ms": 24.526815789473684,
        "predicted_per_second": 40.77170100609537
    }
}

It would be more efficient if the request took a requested_fields param that would then be used to filter the response on the server side. For example, "requested_fields": ["content"] would return only:

{
    "content": "[\n                {\n                  \"function_name\": \"create_user\",\n                  \"username\": \"my_user\",\n                  \"email\": \"my_email@example.com\",\n                  \"password\": \"password123\"\n                }\n              ]\n            \t\t\t\t\t\t\t\t"
}

Motivation

Serialization and deserialization can become a bottleneck in high-throughput environments and increase latency.

Possible Implementation

No response

贡献者指南