URL parser fixes/improvements

This commit is contained in:
Thulinma 2017-10-24 13:58:56 +02:00
parent 7d420d884e
commit 047aebdb27
3 changed files with 144 additions and 106 deletions

View file

@ -1,93 +1,71 @@
#include "encode.h" #include "encode.h"
namespace Encodings { namespace Encodings{
/// The 64 symbols used by Base64::encode and Base64::decode.
/// A symbol's position in this string is the 6-bit value it stands for.
const std::string Base64::chars = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
                                  "abcdefghijklmnopqrstuvwxyz"
                                  "0123456789+/";
/// Helper for base64_decode function.
/// \returns true when c is '+', '/' or a character isalnum() accepts.
inline bool Base64::is_base64(unsigned char c){
  if (c == '+' || c == '/'){return true;}
  return isalnum(c) != 0;
}
/// Used to base64 encode data. Input is the plaintext as std::string, output is the encoded data
/// as std::string. Every group of up to three input bytes becomes four output symbols; a short
/// final group is completed with '=' padding.
/// \param input Plaintext data to encode.
/// \returns Base64 encoded data.
std::string Base64::encode(std::string const input){
  std::string ret;
  const size_t in_len = input.size();
  ret.reserve(((in_len + 2) / 3) * 4);
  for (size_t x = 0; x < in_len; x += 3){
    // Number of real input bytes in this group: 3, or 1-2 for the final short group.
    const size_t n = (in_len - x < 3) ? (in_len - x) : 3;
    // Missing bytes must contribute zero bits. Filling them with the character '0' (0x30)
    // leaked set bits into the second symbol of a one-byte group ("a" -> "YT==" instead of "YQ==").
    unsigned char triple[3] = {0, 0, 0};
    for (size_t i = 0; i < n; i++){triple[i] = input[x + i];}
    char quad[4];
    quad[0] = chars[triple[0] >> 2];                               // top 6 bits of byte 0
    quad[1] = chars[((triple[0] & 0x03) << 4) | (triple[1] >> 4)]; // low 2 of byte 0, top 4 of byte 1
    // Symbols that would only carry padding bits are replaced by '='.
    quad[2] = (n > 1) ? chars[((triple[1] & 0x0F) << 2) | (triple[2] >> 6)] : '=';
    quad[3] = (n > 2) ? chars[triple[2] & 0x3F] : '=';
    ret.append(quad, 4);
  }
  return ret;
}// base64_encode
/// Used to base64 decode data. Input is the encoded data as std::string, output is the plaintext data as std::string. /// Used to base64 decode data. Input is the encoded data as std::string, output is the plaintext
/// \param encoded_string Base64 encoded data to decode. /// data as std::string. \param encoded_string Base64 encoded data to decode. \returns Plaintext
/// \returns Plaintext decoded data. /// decoded data.
std::string Base64::decode(std::string const & encoded_string) { std::string Base64::decode(std::string const &encoded_string){
int in_len = encoded_string.size(); int in_len = encoded_string.size();
int i = 0; int i = 0;
int j = 0; int j = 0;
int in_ = 0; int in_ = 0;
unsigned char char_array_4[4], char_array_3[3]; unsigned char char_array_4[4], char_array_3[3];
std::string ret; std::string ret;
while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])) { while (in_len-- && (encoded_string[in_] != '=') && is_base64(encoded_string[in_])){
char_array_4[i++ ] = encoded_string[in_]; char_array_4[i++] = encoded_string[in_];
in_++; in_++;
if (i == 4) { if (i == 4){
for (i = 0; i < 4; i++) { for (i = 0; i < 4; i++){char_array_4[i] = chars.find(char_array_4[i]);}
char_array_4[i] = chars.find(char_array_4[i]);
}
char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4); char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
for (i = 0; (i < 3); i++) { for (i = 0; (i < 3); i++){ret += char_array_3[i];}
ret += char_array_3[i];
}
i = 0; i = 0;
} }
} }
if (i) { if (i){
for (j = i; j < 4; j++) { for (j = i; j < 4; j++){char_array_4[j] = 0;}
char_array_4[j] = 0; for (j = 0; j < 4; j++){char_array_4[j] = chars.find(char_array_4[j]);}
}
for (j = 0; j < 4; j++) {
char_array_4[j] = chars.find(char_array_4[j]);
}
char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4); char_array_3[0] = (char_array_4[0] << 2) + ((char_array_4[1] & 0x30) >> 4);
char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2); char_array_3[1] = ((char_array_4[1] & 0xf) << 4) + ((char_array_4[2] & 0x3c) >> 2);
char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3]; char_array_3[2] = ((char_array_4[2] & 0x3) << 6) + char_array_4[3];
for (j = 0; (j < i - 1); j++) for (j = 0; (j < i - 1); j++) ret += char_array_3[j];
ret += char_array_3[j];
} }
return ret; return ret;
} }
/// Encodes a single character as two hex digits in string form. /// Encodes a single character as two hex digits in string form.
std::string Hex::chr(char dec){ std::string Hex::chr(char dec){
char dig1 = (dec & 0xF0) >> 4; char dig1 = (dec & 0xF0) >> 4;
@ -103,50 +81,52 @@ namespace Encodings {
} }
/// Decodes a hex-encoded std::string to a raw binary std::string.
/// Each pair of input characters becomes one output byte, first character in the high nibble.
/// Input characters are not validated. If the input has an odd length, the final unpaired
/// character is ignored.
std::string Hex::decode(const std::string &in){
  std::string ret(in.size() / 2, '\000');
  // Only walk complete pairs: with an odd-length input the last character would otherwise be
  // OR-ed into ret[ret.size()], which is the string terminator and must not be modified.
  const size_t usable = ret.size() * 2;
  for (size_t i = 0; i < usable; ++i){
    char c = in[i];
    // Nibble value = low four bits, plus 9 when bit 6 is set (true for 'A'-'F' and 'a'-'f').
    // Even positions are shifted into the high nibble, odd positions stay in the low nibble.
    ret[i >> 1] |= ((c & 15) + (((c & 64) >> 6) | ((c & 64) >> 3))) << ((~i & 1) << 2);
  }
  return ret;
}
/// urlencodes std::string data, leaving only the characters A-Za-z0-9~!&()' alone. /// urlencodes std::string data, leaving only the characters A-Za-z0-9~!&()' alone.
std::string URL::encode(const std::string & c){ std::string URL::encode(const std::string &c){
std::string escaped = ""; std::string escaped = "";
int max = c.length(); int max = c.length();
for (int i = 0; i < max; i++) { for (int i = 0; i < max; i++){
if (('0' <= c[i] && c[i] <= '9') || ('a' <= c[i] && c[i] <= 'z') || ('A' <= c[i] && c[i] <= 'Z') if (('0' <= c[i] && c[i] <= '9') || ('a' <= c[i] && c[i] <= 'z') ||
|| (c[i] == '~' || c[i] == '!' || c[i] == '*' || c[i] == '(' || c[i] == ')' || c[i] == '\'')) { ('A' <= c[i] && c[i] <= 'Z') ||
(c[i] == '~' || c[i] == '!' || c[i] == '*' || c[i] == '(' || c[i] == ')' || c[i] == '/' ||
c[i] == '\'')){
escaped.append(&c[i], 1); escaped.append(&c[i], 1);
} else { }else{
escaped.append("%"); if (c[i] == ' '){
escaped.append(Hex::chr(c[i])); escaped.append("+");
}else{
escaped.append("%");
escaped.append(Hex::chr(c[i]));
}
} }
} }
return escaped; return escaped;
} }
/// urldecodes std::string data, parsing out both %-encoded characters and +-encoded spaces. /// urldecodes std::string data, parsing out both %-encoded characters and +-encoded spaces.
std::string URL::decode(const std::string & in){ std::string URL::decode(const std::string &in){
std::string out; std::string out;
for (unsigned int i = 0; i < in.length(); ++i) { for (unsigned int i = 0; i < in.length(); ++i){
if (in[i] == '%') { if (in[i] == '%'){
char tmp = 0; char tmp = 0;
++i; ++i;
if (i < in.length()) { if (i < in.length()){tmp = Hex::ord(in[i]) << 4;}
tmp = Hex::ord(in[i]) << 4;
}
++i; ++i;
if (i < in.length()) { if (i < in.length()){tmp += Hex::ord(in[i]);}
tmp += Hex::ord(in[i]);
}
out += tmp; out += tmp;
} else { }else{
if (in[i] == '+') { if (in[i] == '+'){
out += ' '; out += ' ';
} else { }else{
out += in[i]; out += in[i];
} }
} }
@ -154,5 +134,5 @@ namespace Encodings {
return out; return out;
} }
}//Encodings namespace }// namespace Encodings

View file

@ -8,7 +8,7 @@
/// Helper function to check if the given c-string is numeric or not /// Helper function to check if the given c-string is numeric or not
static bool is_numeric(const char * str){ static bool is_numeric(const char * str){
while (str != 0){ while (str[0] != 0){
if (str[0] < 48 || str[0] > 57){return false;} if (str[0] < 48 || str[0] > 57){return false;}
++str; ++str;
} }
@ -17,6 +17,7 @@ static bool is_numeric(const char * str){
///Constructor that does the actual parsing ///Constructor that does the actual parsing
HTTP::URL::URL(const std::string & url){ HTTP::URL::URL(const std::string & url){
IPv6Addr = false;
//first detect protocol at the start, if any //first detect protocol at the start, if any
size_t proto_sep = url.find("://"); size_t proto_sep = url.find("://");
if (proto_sep != std::string::npos){ if (proto_sep != std::string::npos){
@ -24,6 +25,9 @@ HTTP::URL::URL(const std::string & url){
proto_sep += 3; proto_sep += 3;
}else{ }else{
proto_sep = 0; proto_sep = 0;
if (url.substr(0, 2) == "//"){
proto_sep = 2;
}
} }
//proto_sep now points to the start of the host, guaranteed //proto_sep now points to the start of the host, guaranteed
//continue by finding the path, if any //continue by finding the path, if any
@ -36,7 +40,7 @@ HTTP::URL::URL(const std::string & url){
} }
size_t hmark = path.find('#'); size_t hmark = path.find('#');
if (hmark != std::string::npos){ if (hmark != std::string::npos){
frag = path.substr(hmark+1); frag = Encodings::URL::decode(path.substr(hmark+1));
path.erase(hmark); path.erase(hmark);
} }
size_t qmark = path.find('?'); size_t qmark = path.find('?');
@ -45,15 +49,28 @@ HTTP::URL::URL(const std::string & url){
path.erase(qmark); path.erase(qmark);
} }
if (path.size()){ if (path.size()){
if (path[0] == '/'){
path.erase(0, 1);
}
size_t dots = path.find("/./"); size_t dots = path.find("/./");
while (dots != std::string::npos){ while (dots != std::string::npos){
DONTEVEN_MSG("%s (/./ -> /)", path.c_str());
path.erase(dots, 2); path.erase(dots, 2);
dots = path.find("/./"); dots = path.find("/./");
} }
dots = path.find("//");
while (dots != std::string::npos){
DONTEVEN_MSG("%s (// -> /)", path.c_str());
path.erase(dots, 1);
dots = path.find("//");
}
if (path[0] == '/'){
path.erase(0, 1);
}
dots = path.find("/../"); dots = path.find("/../");
while (dots != std::string::npos){ while (dots != std::string::npos){
size_t prevslash = path.rfind('/', dots-1); size_t prevslash = path.rfind('/', dots-1);
if (prevslash == std::string::npos){ if (prevslash == std::string::npos || dots == 0){
path.erase(0, dots+4); path.erase(0, dots+4);
}else{ }else{
path.erase(prevslash+1, dots-prevslash+3); path.erase(prevslash+1, dots-prevslash+3);
@ -66,39 +83,59 @@ HTTP::URL::URL(const std::string & url){
if (path.substr(0, 3) == "../"){ if (path.substr(0, 3) == "../"){
path.erase(0, 3); path.erase(0, 3);
} }
path = Encodings::URL::decode(path);
} }
} }
//host and port are now definitely between proto_sep and first_slash //user, pass, host and port are now definitely between proto_sep and first_slash
//we check for [ at the start because we may have an IPv6 address as host std::string uphp = url.substr(proto_sep, first_slash-proto_sep);//user+pass+host+port
if (url[proto_sep] == '['){ //Check if we have a user/pass before the host
//IPv6 address - find matching brace size_t at_sign = uphp.find('@');
size_t closing_brace = url.find(']', proto_sep); if (at_sign != std::string::npos){
//check if it exists at all std::string creds = uphp.substr(0, at_sign);
if (closing_brace == std::string::npos || closing_brace > first_slash){ uphp.erase(0, at_sign+1);
//assume host ends at first slash if there is no closing brace before it size_t colon = creds.find(':');
closing_brace = first_slash; if (colon != std::string::npos){
user = Encodings::URL::decode(creds.substr(0, colon));
pass = Encodings::URL::decode(creds.substr(colon+1));
}else{
user = Encodings::URL::decode(creds);
} }
host = url.substr(proto_sep+1, closing_brace-(proto_sep+1)); }
//we check for [ at the start because we may have an IPv6 address as host
if (uphp[0] == '['){
//IPv6 address - find matching brace
IPv6Addr = true;
size_t closing_brace = uphp.find(']');
host = uphp.substr(1, closing_brace-1);
//continue by finding port, if any //continue by finding port, if any
size_t colon = url.rfind(':', first_slash); size_t colon = uphp.find(':', closing_brace);
if (colon == std::string::npos || colon <= closing_brace){ if (colon == std::string::npos){
//no port. Assume 80 //no port. Assume default
port = "80"; port = "";
}else{ }else{
//we have a port number, read it //we have a port number, read it
port = url.substr(colon+1, first_slash-(colon+1)); port = uphp.substr(colon+1);
if (!is_numeric(port.c_str())){
host += ":" + port;
port = "";
}
} }
}else{ }else{
//"normal" host - first find port, if any //"normal" host - first find port, if any
size_t colon = url.rfind(':', first_slash); size_t colon = uphp.rfind(':');
if (colon == std::string::npos || colon < proto_sep){ if (colon == std::string::npos){
//no port. Assume default //no port. Assume default
port = ""; port = "";
host = url.substr(proto_sep, first_slash-proto_sep); host = uphp;
}else{ }else{
//we have a port number, read it //we have a port number, read it
port = url.substr(colon+1, first_slash-(colon+1)); port = uphp.substr(colon+1);
host = url.substr(proto_sep, colon-proto_sep); host = uphp.substr(0, colon);
if (!is_numeric(port.c_str())){
IPv6Addr = true;
host += ":" + port;
port = "";
}
} }
} }
//if the host is numeric, assume it is a port, instead //if the host is numeric, assume it is a port, instead
@ -121,25 +158,35 @@ uint32_t HTTP::URL::getPort() const{
///Returns the default port for the protocol in numeric format ///Returns the default port for the protocol in numeric format
uint32_t HTTP::URL::getDefaultPort() const{ uint32_t HTTP::URL::getDefaultPort() const{
if (protocol == "http"){return 80;}
if (protocol == "https"){return 443;} if (protocol == "https"){return 443;}
if (protocol == "rtmp"){return 1935;} if (protocol == "rtmp"){return 1935;}
if (protocol == "dtsc"){return 4200;} if (protocol == "dtsc"){return 4200;}
return 80; if (protocol == "rtsp"){return 554;}
return 0;
} }
///Returns the full URL in string format ///Returns the full URL in string format
std::string HTTP::URL::getUrl() const{ std::string HTTP::URL::getUrl() const{
std::string ret; std::string ret;
if (protocol.size()){ if (protocol.size()){
ret = protocol + "://" + host; ret = protocol + "://";
}else{ }else{
ret = "//" + host; ret = "//";
}
if (user.size() || pass.size()){
ret += Encodings::URL::encode(user) + ":" + Encodings::URL::encode(pass) + "@";
}
if (IPv6Addr){
ret += "[" + host + "]";
}else{
ret += host;
} }
if (port.size() && getPort() != getDefaultPort()){ret += ":" + port;} if (port.size() && getPort() != getDefaultPort()){ret += ":" + port;}
ret += "/"; ret += "/";
if (path.size()){ret += path;} if (path.size()){ret += Encodings::URL::encode(path);}
if (args.size()){ret += "?" + args;} if (args.size()){ret += "?" + args;}
if (frag.size()){ret += "#" + frag;} if (frag.size()){ret += "#" + Encodings::URL::encode(frag);}
return ret; return ret;
} }
@ -147,13 +194,21 @@ std::string HTTP::URL::getUrl() const{
std::string HTTP::URL::getBareUrl() const{ std::string HTTP::URL::getBareUrl() const{
std::string ret; std::string ret;
if (protocol.size()){ if (protocol.size()){
ret = protocol + "://" + host; ret = protocol + "://";
}else{ }else{
ret = "//" + host; ret = "//";
}
if (user.size() || pass.size()){
ret += Encodings::URL::encode(user) + ":" + Encodings::URL::encode(pass) + "@";
}
if (IPv6Addr){
ret += "[" + host + "]";
}else{
ret += host;
} }
if (port.size() && getPort() != getDefaultPort()){ret += ":" + port;} if (port.size() && getPort() != getDefaultPort()){ret += ":" + port;}
ret += "/"; ret += "/";
if (path.size()){ret += path;} if (path.size()){ret += Encodings::URL::encode(path);}
return ret; return ret;
} }

View file

@ -83,7 +83,10 @@ namespace HTTP {
std::string path;///<Path after the first slash (not inclusive) but before any question mark std::string path;///<Path after the first slash (not inclusive) but before any question mark
std::string args;///<Everything after the question mark in the path, if it was present std::string args;///<Everything after the question mark in the path, if it was present
std::string frag;///<Everything after the # in the path, if it was present std::string frag;///<Everything after the # in the path, if it was present
std::string user;///<Username, if it was present
std::string pass;///<Password, if it was present
URL link(const std::string &l) const; URL link(const std::string &l) const;
bool IPv6Addr;
}; };
}//HTTP namespace }//HTTP namespace