URL parser fixes/improvements

2017-10-24 13:58:56 +02:00 · 2017-10-24 13:58:56 +02:00 · 047aebdb27
commit 047aebdb27
parent 7d420d884e
3 changed files with 144 additions and 106 deletions
--- a/lib/encode.cpp
+++ b/lib/encode.cpp
@ -1,93 +1,71 @@
 #include "encode.h"
-namespace Encodings {
+namespace Encodings{
  /// Needed for base64_encode function
-  const std::string Base64::chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+  const std::string Base64::chars =
      "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
  /// Helper for base64_decode function
-  inline bool Base64::is_base64(unsigned char c) {
+  inline bool Base64::is_base64(unsigned char c){
    return (isalnum(c) || (c == '+') || (c == '/'));
  }
-  /// Used to base64 encode data. Input is the plaintext as std::string, output is the encoded data as std::string.
+  /// Used to base64 encode data. Input is the plaintext as std::string, output is the encoded data
-  /// \param input Plaintext data to encode.
+  /// as std::string. \param input Plaintext data to encode. \returns Base64 encoded data.
-  /// \returns Base64 encoded data.
+  std::string Base64::encode(std::string const input){
  std::string Base64::encode(std::string const input) {
    std::string ret;
    unsigned int in_len = input.size();
    char quad[4], triple[3];
    unsigned int i, x, n = 3;
-    for (x = 0; x < in_len; x = x + 3) {
+    for (x = 0; x < in_len; x = x + 3){
-      if ((in_len - x) / 3 == 0) {
+      if ((in_len - x) / 3 == 0){n = (in_len - x) % 3;}
-        n = (in_len - x) % 3;
+      for (i = 0; i < 3; i++){triple[i] = '0';}
-      }
+      for (i = 0; i < n; i++){triple[i] = input[x + i];}
-      for (i = 0; i < 3; i++) {
+      quad[0] = chars[(triple[0] & 0xFC) >> 2];                               // FC = 11111100
        triple[i] = '0';
      }
      for (i = 0; i < n; i++) {
        triple[i] = input[x + i];
      }
      quad[0] = chars[(triple[0] & 0xFC) >> 2]; // FC = 11111100
      quad[1] = chars[((triple[0] & 0x03) << 4) | ((triple[1] & 0xF0) >> 4)]; // 03 = 11
      quad[2] = chars[((triple[1] & 0x0F) << 2) | ((triple[2] & 0xC0) >> 6)]; // 0F = 1111, C0 = 11000000
-      quad[3] = chars[triple[2] & 0x3F]; // 3F = 111111
+      quad[3] = chars[triple[2] & 0x3F];                                      // 3F = 111111
-      if (n < 3) {
+      if (n < 3){quad[3] = '=';}
-        quad[3] = '=';
+      if (n < 2){quad[2] = '=';}
-      }
+      for (i = 0; i < 4; i++){ret += quad[i];}
      if (n < 2) {
        quad[2] = '=';
      }
      for (i = 0; i < 4; i++) {
        ret += quad[i];
      }
    }
    return ret;
-  } //base64_encode
+  }// base64_encode
-  /// Used to base64 decode data. Input is the encoded data as std::string, output is the plaintext data as std::string.
+  /// Used to base64 decode data. Input is the encoded data as std::string, output is the plaintext
-  /// \param encoded_string Base64 encoded data to decode.
+  /// data as std::string. \param encoded_string Base64 encoded data to decode. \returns Plaintext
-  /// \returns Plaintext decoded data.
+  /// decoded data.
-  std::string Base64::decode(std::string const & encoded_string) {
+  std::string Base64::decode(std::string const &encoded_string){
    int in_len = encoded_string.size();
    int i = 0;
    int j = 0;
    int in_ = 0;
    unsigned char char_array_4[4], char_array_3[3];
    std::string ret;
-    while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) {
+    while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])){
-      char_array_4[i++ ] = encoded_string[in_];
+      char_array_4[i++] = encoded_string[in_];
      in_++;
-      if (i == 4) {
+      if (i == 4){
-        for (i = 0; i < 4; i++) {
+        for (i = 0; i < 4; i++){char_array_4[i] = chars.find(char_array_4[i]);}
          char_array_4[i] = chars.find(char_array_4[i]);
        }
        char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
        char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
        char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
-        for (i = 0; (i < 3); i++) {
+        for (i = 0; (i < 3); i++){ret += char_array_3[i];}
          ret += char_array_3[i];
        }
        i = 0;
      }
    }
-    if (i) {
+    if (i){
-      for (j = i; j < 4; j++) {
+      for (j = i; j < 4; j++){char_array_4[j] = 0;}
-        char_array_4[j] = 0;
+      for (j = 0; j < 4; j++){char_array_4[j] = chars.find(char_array_4[j]);}
      }
      for (j = 0; j < 4; j++) {
        char_array_4[j] = chars.find(char_array_4[j]);
      }
      char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
      char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
      char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
-      for (j = 0; (j < i - 1); j++)
+      for (j = 0; (j < i - 1); j++) ret += char_array_3[j];
        ret += char_array_3[j];
    }
    return ret;
  }
  /// Encodes a single character as two hex digits in string form.
  std::string Hex::chr(char dec){
    char dig1 = (dec & 0xF0) >> 4;
@ -103,50 +81,52 @@ namespace Encodings {
  }
  /// Decodes a hex-encoded std::string to a raw binary std::string.
-  std::string Hex::decode(const std::string & in){
+  std::string Hex::decode(const std::string &in){
-    std::string ret(in.size()/2, '\000');
+    std::string ret(in.size() / 2, '\000');
    for (size_t i = 0; i < in.size(); ++i){
      char c = in[i];
-      ret[i>>1] |= ((c&15) + (((c&64)>>6) | ((c&64)>>3))) << ((~i&1) << 2);
+      ret[i >> 1] |= ((c & 15) + (((c & 64) >> 6) | ((c & 64) >> 3))) << ((~i & 1) << 2);
    }
    return ret;
  }
  /// urlencodes std::string data, leaving only the characters A-Za-z0-9~!*()'/ alone; spaces become '+'.
-  std::string URL::encode(const std::string & c){
+  std::string URL::encode(const std::string &c){
    std::string escaped = "";
    int max = c.length();
-    for (int i = 0; i < max; i++) {
+    for (int i = 0; i < max; i++){
-      if (('0' <= c[i] && c[i] <= '9') || ('a' <= c[i] && c[i] <= 'z') || ('A' <= c[i] && c[i] <= 'Z')
+      if (('0' <= c[i] && c[i] <= '9') || ('a' <= c[i] && c[i] <= 'z') ||
-          || (c[i] == '~' || c[i] == '!' || c[i] == '*' || c[i] == '(' || c[i] == ')' || c[i] == '\'')) {
+          ('A' <= c[i] && c[i] <= 'Z') ||
          (c[i] == '~' || c[i] == '!' || c[i] == '*' || c[i] == '(' || c[i] == ')' || c[i] == '/' ||
           c[i] == '\'')){
        escaped.append(&c[i], 1);
-      } else {
+      }else{
-        escaped.append("%");
+        if (c[i] == ' '){
-        escaped.append(Hex::chr(c[i]));
+          escaped.append("+");
        }else{
          escaped.append("%");
          escaped.append(Hex::chr(c[i]));
        }
      }
    }
    return escaped;
  }
  /// urldecodes std::string data, parsing out both %-encoded characters and +-encoded spaces.
-  std::string URL::decode(const std::string & in){
+  std::string URL::decode(const std::string &in){
    std::string out;
-    for (unsigned int i = 0; i < in.length(); ++i) {
+    for (unsigned int i = 0; i < in.length(); ++i){
-      if (in[i] == '%') {
+      if (in[i] == '%'){
        char tmp = 0;
        ++i;
-        if (i < in.length()) {
+        if (i < in.length()){tmp = Hex::ord(in[i]) << 4;}
          tmp = Hex::ord(in[i]) << 4;
        }
        ++i;
-        if (i < in.length()) {
+        if (i < in.length()){tmp += Hex::ord(in[i]);}
          tmp += Hex::ord(in[i]);
        }
        out += tmp;
-      } else {
+      }else{
-        if (in[i] == '+') {
+        if (in[i] == '+'){
          out += ' ';
-        } else {
+        }else{
          out += in[i];
        }
      }
@ -154,5 +134,5 @@ namespace Encodings {
    return out;
  }
-}//Encodings namespace
+}// namespace Encodings
--- a/lib/http_parser.cpp
+++ b/lib/http_parser.cpp
@ -8,7 +8,7 @@
 /// Helper function to check if the given c-string is numeric or not
 static bool is_numeric(const char * str){
-  while (str != 0){
+  while (str[0] != 0){
    if (str[0] < 48 || str[0] > 57){return false;}
    ++str;
  }
@ -17,6 +17,7 @@ static bool is_numeric(const char * str){
 ///Constructor that does the actual parsing
 HTTP::URL::URL(const std::string & url){
  IPv6Addr = false;
  //first detect protocol at the start, if any
  size_t proto_sep = url.find("://");
  if (proto_sep != std::string::npos){
@ -24,6 +25,9 @@ HTTP::URL::URL(const std::string & url){
    proto_sep += 3;
  }else{
    proto_sep = 0;
    if (url.substr(0, 2) == "//"){
      proto_sep = 2;
    }
  }
  //proto_sep now points to the start of the host, guaranteed
  //continue by finding the path, if any
@ -36,7 +40,7 @@ HTTP::URL::URL(const std::string & url){
    }
    size_t hmark = path.find('#');
    if (hmark != std::string::npos){
-      frag = path.substr(hmark+1);
+      frag = Encodings::URL::decode(path.substr(hmark+1));
      path.erase(hmark);
    }
    size_t qmark = path.find('?');
@ -45,15 +49,28 @@ HTTP::URL::URL(const std::string & url){
      path.erase(qmark);
    }
    if (path.size()){
      if (path[0] == '/'){
        path.erase(0, 1);
      }
      size_t dots = path.find("/./");
      while (dots != std::string::npos){
        DONTEVEN_MSG("%s (/./ -> /)", path.c_str());
        path.erase(dots, 2);
        dots = path.find("/./");
      }
      dots = path.find("//");
      while (dots != std::string::npos){
        DONTEVEN_MSG("%s (// -> /)", path.c_str());
        path.erase(dots, 1);
        dots = path.find("//");
      }
      if (path[0] == '/'){
        path.erase(0, 1);
      }
      dots = path.find("/../");
      while (dots != std::string::npos){
        size_t prevslash = path.rfind('/', dots-1);
-        if (prevslash == std::string::npos){
+        if (prevslash == std::string::npos || dots == 0){
          path.erase(0, dots+4);
        }else{
          path.erase(prevslash+1, dots-prevslash+3);
@ -66,39 +83,59 @@ HTTP::URL::URL(const std::string & url){
      if (path.substr(0, 3) == "../"){
        path.erase(0, 3);
      }
      path = Encodings::URL::decode(path);
    }
  }
-  //host and port are now definitely between proto_sep and first_slash
+  //user, pass, host and port are now definitely between proto_sep and first_slash
-  //we check for [ at the start because we may have an IPv6 address as host
+  std::string uphp = url.substr(proto_sep, first_slash-proto_sep);//user+pass+host+port
-  if (url[proto_sep] == '['){
+  //Check if we have a user/pass before the host
-    //IPv6 address - find matching brace
+  size_t at_sign = uphp.find('@');
-    size_t closing_brace = url.find(']', proto_sep);
+  if (at_sign != std::string::npos){
-    //check if it exists at all
+    std::string creds = uphp.substr(0, at_sign);
-    if (closing_brace == std::string::npos || closing_brace > first_slash){
+    uphp.erase(0, at_sign+1);
-      //assume host ends at first slash if there is no closing brace before it
+    size_t colon = creds.find(':');
-      closing_brace = first_slash;
+    if (colon != std::string::npos){
      user = Encodings::URL::decode(creds.substr(0, colon));
      pass = Encodings::URL::decode(creds.substr(colon+1));
    }else{
      user = Encodings::URL::decode(creds);
    }
-    host = url.substr(proto_sep+1, closing_brace-(proto_sep+1));
+  }
  //we check for [ at the start because we may have an IPv6 address as host
  if (uphp[0] == '['){
    //IPv6 address - find matching brace
    IPv6Addr = true;
    size_t closing_brace = uphp.find(']');
    host = uphp.substr(1, closing_brace-1);
    //continue by finding port, if any
-    size_t colon = url.rfind(':', first_slash);
+    size_t colon = uphp.find(':', closing_brace);
-    if (colon == std::string::npos || colon <= closing_brace){
+    if (colon == std::string::npos){
-      //no port. Assume 80
+      //no port. Assume default
-      port = "80";
+      port = "";
    }else{
      //we have a port number, read it
-      port = url.substr(colon+1, first_slash-(colon+1));
+      port = uphp.substr(colon+1);
      if (!is_numeric(port.c_str())){
        host += ":" + port;
        port = "";
      }
    }
  }else{
    //"normal" host - first find port, if any
-    size_t colon = url.rfind(':', first_slash);
+    size_t colon = uphp.rfind(':');
-    if (colon == std::string::npos || colon < proto_sep){
+    if (colon == std::string::npos){
      //no port. Assume default
      port = "";
-      host = url.substr(proto_sep, first_slash-proto_sep);
+      host = uphp;
    }else{
      //we have a port number, read it
-      port = url.substr(colon+1, first_slash-(colon+1));
+      port = uphp.substr(colon+1);
-      host = url.substr(proto_sep, colon-proto_sep);
+      host = uphp.substr(0, colon);
      if (!is_numeric(port.c_str())){
        IPv6Addr = true;
        host += ":" + port;
        port = "";
      }
    }
  }
  //if the host is numeric, assume it is a port, instead
@ -121,25 +158,35 @@ uint32_t HTTP::URL::getPort() const{
 ///Returns the default port for the protocol in numeric format
 uint32_t HTTP::URL::getDefaultPort() const{
  if (protocol == "http"){return 80;}
  if (protocol == "https"){return 443;}
  if (protocol == "rtmp"){return 1935;}
  if (protocol == "dtsc"){return 4200;}
-  return 80;
+  if (protocol == "rtsp"){return 554;}
  return 0;
 }
 ///Returns the full URL in string format
 std::string HTTP::URL::getUrl() const{
  std::string ret;
  if (protocol.size()){
-    ret = protocol + "://" + host;
+    ret = protocol + "://";
  }else{
-    ret = "//" + host;
+    ret = "//";
  }
  if (user.size() || pass.size()){
    ret += Encodings::URL::encode(user) + ":" + Encodings::URL::encode(pass) + "@";
  }
  if (IPv6Addr){
    ret += "[" + host + "]";
  }else{
    ret += host;
  }
  if (port.size() && getPort() != getDefaultPort()){ret += ":" + port;}
  ret += "/";
-  if (path.size()){ret += path;}
+  if (path.size()){ret += Encodings::URL::encode(path);}
  if (args.size()){ret += "?" + args;}
-  if (frag.size()){ret += "#" + frag;}
+  if (frag.size()){ret += "#" + Encodings::URL::encode(frag);}
  return ret;
 }
@ -147,13 +194,21 @@ std::string HTTP::URL::getUrl() const{
 std::string HTTP::URL::getBareUrl() const{
  std::string ret;
  if (protocol.size()){
-    ret = protocol + "://" + host;
+    ret = protocol + "://";
  }else{
-    ret = "//" + host;
+    ret = "//";
  }
  if (user.size() || pass.size()){
    ret += Encodings::URL::encode(user) + ":" + Encodings::URL::encode(pass) + "@";
  }
  if (IPv6Addr){
    ret += "[" + host + "]";
  }else{
    ret += host;
  }
  if (port.size() && getPort() != getDefaultPort()){ret += ":" + port;}
  ret += "/";
-  if (path.size()){ret += path;}
+  if (path.size()){ret += Encodings::URL::encode(path);}
  return ret;
 }
--- a/lib/http_parser.h
+++ b/lib/http_parser.h
@ -83,7 +83,10 @@ namespace HTTP {
      std::string path;///<Path after the first slash (not inclusive) but before any question mark
      std::string args;///<Everything after the question mark in the path, if it was present
      std::string frag;///<Everything after the # in the path, if it was present
      std::string user;///<Username, if it was present
      std::string pass;///<Password, if it was present
      URL link(const std::string &l) const;
      bool IPv6Addr;
  };
 }//HTTP namespace