URL相对路径转绝对路径
在做爬虫或者网页解析时，经常会从网页中提取到很多相对 URL。在进一步爬取之前，需要先把这些相对地址转换成完整的绝对 URL。URL 和文件路径很相似，转换看似简单，实则有很多情况需要考虑，网上流传的很多方法都存在各种各样的问题。这里提供一个相对靠谱的实现，代码如下。
// Usage example: resolve the relative reference "foo" against the base URL.
// Prints https://www.google.com/foo
echo PhpUri::parse("https://www.google.com/")->join("foo");
/**
 * Resolves relative URL references against a base URL.
 *
 * A URL is split into its five generic components (scheme, authority, path,
 * query, fragment) using the regular expression from RFC 3986, Appendix B.
 * join() then combines a base URL with a relative reference and returns the
 * resulting absolute URL as a string.
 *
 * A component counts as "absent" only when it is null or the empty string,
 * so a component whose text is "0" (path "0", query "0", host "0", ...) is
 * kept intact.
 *
 * Known limitation: a present-but-empty query or fragment ("?" / "#") is not
 * distinguished from a missing one.
 */
class PhpUri
{
    /** @var string Scheme without the trailing ":" ('' when absent). */
    public $scheme;

    /** @var string Authority (host, optional userinfo/port) without the leading "//" ('' when absent). */
    public $authority;

    /** @var string Path component ('' when absent). */
    public $path;

    /** @var string Query without the leading "?" ('' when absent). */
    public $query;

    /** @var string Fragment without the leading "#" ('' when absent). */
    public $fragment;

    /**
     * Convenience entry point: converts a relative URL into an absolute one.
     *
     * @param string $baseUrl     Absolute URL of the document the reference was found in.
     * @param string $relativeUrl Reference to resolve (may already be absolute).
     * @return string The resolved URL.
     */
    public static function doIt($baseUrl, $relativeUrl)
    {
        return self::parse($baseUrl)->join($relativeUrl);
    }

    /**
     * Splits $url into its components.
     *
     * @param string $url URL or relative reference to parse.
     */
    public function __construct($url)
    {
        // RFC 3986 Appendix B pattern. The "s" modifier lets "." match newlines,
        // so every input string matches and no component is left undefined.
        preg_match(
            '/^(([^:\/?#]+):)?(\/\/([^\/?#]*))?([^?#]*)(\?([^#]*))?(#(.*))?$/s',
            $url,
            $parts
        );

        // preg_match() omits trailing groups that did not participate in the
        // match, hence the explicit '' defaults.
        $this->scheme    = isset($parts[2]) ? $parts[2] : '';
        $this->authority = isset($parts[4]) ? $parts[4] : '';
        $this->path      = isset($parts[5]) ? $parts[5] : '';
        $this->query     = isset($parts[7]) ? $parts[7] : '';
        $this->fragment  = isset($parts[9]) ? $parts[9] : '';
    }

    /**
     * Parses a base URL; an absent path is replaced by "/".
     *
     * @param string $url Base URL.
     * @return PhpUri
     */
    public static function parse($url)
    {
        $uri = new PhpUri($url);
        if (self::isBlank($uri->path)) {
            $uri->path = '/';
        }

        return $uri;
    }

    /**
     * Resolves $relative against this URI.
     *
     * - A reference with its own scheme or authority keeps its own path and query.
     * - A reference without a path inherits this URI's path (and its query, if
     *   the reference has none).
     * - A path starting with "/" is used as is.
     * - Any other path is appended to the directory part of this URI's path.
     *
     * Dot segments and repeated slashes in the resulting path are normalized.
     *
     * @param string $relative Reference to resolve.
     * @return string The resolved URL.
     */
    public function join($relative)
    {
        $uri = new PhpUri($relative);

        if (self::isBlank($uri->scheme) && self::isBlank($uri->authority)) {
            if (self::isBlank($uri->path)) {
                $uri->path = $this->path;
                if (self::isBlank($uri->query)) {
                    $uri->query = $this->query;
                }
            } elseif (strpos($uri->path, '/') !== 0) {
                $uri->path = $this->mergePath($uri->path);
            }
        }

        if (self::isBlank($uri->scheme)) {
            $uri->scheme = $this->scheme;
            if (self::isBlank($uri->authority)) {
                $uri->authority = $this->authority;
            }
        }

        return $uri->toStr();
    }

    /**
     * Appends a relative path to the directory part of this URI's path.
     *
     * @param string $relativePath Path that does not start with "/".
     * @return string The merged, not yet normalized, path.
     */
    private function mergePath($relativePath)
    {
        $basePath = (string) $this->path;

        if (strpos($basePath, '/') === false) {
            // No directory part to inherit: anchor the reference at the root so
            // it can never be glued directly onto the authority ("http://afoo").
            $basePath = '/';
        } else {
            // Drop everything after the last "/" (the "file name" of the base).
            $basePath = preg_replace('/\/[^\/]+$/', '/', $basePath);
        }

        return $basePath . $relativePath;
    }

    /**
     * Reassembles the components into a URL string, normalizing the path.
     *
     * @return string
     */
    private function toStr()
    {
        $ret = '';

        if (!self::isBlank($this->scheme)) {
            $ret .= "{$this->scheme}:";
        }
        if (!self::isBlank($this->authority)) {
            $ret .= "//{$this->authority}";
        }

        $ret .= $this->normalizePath($this->path);

        if (!self::isBlank($this->query)) {
            $ret .= "?{$this->query}";
        }
        if (!self::isBlank($this->fragment)) {
            $ret .= "#{$this->fragment}";
        }

        return $ret;
    }

    /**
     * Collapses repeated slashes and resolves "." and ".." segments.
     *
     * The rewrite rules are applied repeatedly until none of them changes the
     * path any more.
     *
     * @param string $path Path to normalize.
     * @return string Normalized path ('' when $path is absent).
     */
    private function normalizePath($path)
    {
        if (self::isBlank($path)) {
            return '';
        }

        do {
            // "a//b" -> "a/b"
            $path = preg_replace('`//+`', '/', $path, -1, $c0);
            // A leading "/./" or "/../" (or a path that is exactly "/." or "/..")
            // cannot climb above the root.
            $path = preg_replace('`^/\\.\\.?(/|$)`', '/', $path, -1, $c1);
            // "/./" in the middle or "/." at the end
            $path = preg_replace('`/\\.(/|$)`', '/', $path, -1, $c2);
            // "/segment/.." -> "/". The limit is 1 (change by Dominik Habichtsberg,
            // 24 May 2015): with an unlimited count, climbing up several
            // directory levels failed, so one pair is removed per pass.
            $path = preg_replace('`/[^/]*?/\\.\\.(/|$)`', '/', $path, 1, $c3);
        } while ($c0 + $c1 + $c2 + $c3 > 0);

        return $path;
    }

    /**
     * Tells whether a URI component is absent.
     *
     * Unlike empty(), this does not treat the string "0" as absent.
     *
     * @param mixed $value Component value.
     * @return bool
     */
    private static function isBlank($value)
    {
        return $value === null || $value === '';
    }
}