12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039 |
- <?php
- namespace Symfony\Component\DomCrawler;
- use Symfony\Component\CssSelector\CssSelectorConverter;
- class Crawler implements \Countable, \IteratorAggregate
- {
-
/**
 * @var string|null The URI passed to the constructor as $currentUri
 */
protected $uri;

/**
 * @var string Prefix that stands for the document's default (unprefixed) namespace in XPath queries
 */
private $defaultNamespacePrefix = 'default';

/**
 * @var array Map of namespace prefix => namespace URI registered through registerNamespace()
 */
private $namespaces = array();

/**
 * @var string|null Base href handed to the Link and Form objects this crawler creates
 */
private $baseHref;

/**
 * @var \DOMDocument|null The single document every node of this crawler belongs to
 */
private $document;

/**
 * @var \DOMNode[] The nodes currently held by the crawler
 */
private $nodes = array();

/**
 * @var bool Whether the content is HTML (set to false by addXmlContent())
 */
private $isHtml = true;
-
/**
 * Builds a crawler, optionally seeding it with nodes or markup.
 *
 * @param mixed       $node       Anything accepted by add(): DOMNodeList, DOMNode, array, string or null
 * @param string|null $currentUri The current URI
 * @param string|null $baseHref   The base href; falls back to $currentUri when empty
 */
public function __construct($node = null, $currentUri = null, $baseHref = null)
{
    $this->uri = $currentUri;

    if ($baseHref) {
        $this->baseHref = $baseHref;
    } else {
        $this->baseHref = $currentUri;
    }

    $this->add($node);
}
-
/**
 * Empties the crawler: forgets the owning document and drops every node.
 */
public function clear()
{
    $this->document = null;
    $this->nodes = array();
}
-
/**
 * Adds a node (or several) to the crawler, dispatching on the argument type.
 *
 * A string is parsed as markup through addContent(); null is a no-op.
 *
 * @param \DOMNodeList|\DOMNode|array|string|null $node A node
 *
 * @throws \InvalidArgumentException When $node is of any other type
 */
public function add($node)
{
    if (null === $node) {
        return;
    }

    if ($node instanceof \DOMNodeList) {
        $this->addNodeList($node);

        return;
    }

    if ($node instanceof \DOMNode) {
        $this->addNode($node);

        return;
    }

    if (is_array($node)) {
        $this->addNodes($node);

        return;
    }

    if (is_string($node)) {
        $this->addContent($node);

        return;
    }

    $actualType = is_object($node) ? get_class($node) : gettype($node);

    throw new \InvalidArgumentException(sprintf('Expecting a DOMNodeList or DOMNode instance, an array, a string, or null, but got "%s".', $actualType));
}
-
/**
 * Adds HTML or XML content, choosing the parser from the content type.
 *
 * When $type is empty it is guessed from the content: a string starting
 * with an XML declaration is treated as "application/xml", anything else as
 * "text/html". Types mentioning neither "xml" nor "html" are ignored.
 *
 * The charset is read from the "charset=" parameter of $type, then from a
 * meta tag inside the content, and finally defaults to ISO-8859-1.
 *
 * @param string      $content A string to parse as HTML/XML
 * @param string|null $type    The content type of the string
 */
public function addContent($content, $type = null)
{
    if (empty($type)) {
        $type = 0 === strpos($content, '<?xml') ? 'application/xml' : 'text/html';
    }

    // Only HTML and XML payloads can be loaded into a DOM.
    if (!preg_match('/(x|ht)ml/i', $type, $xmlMatches)) {
        return;
    }

    $charset = null;
    if (false !== $pos = stripos($type, 'charset=')) {
        $charset = substr($type, $pos + 8);
        if (false !== $pos = strpos($charset, ';')) {
            $charset = substr($charset, 0, $pos);
        }

        // Parameter values may be quoted and padded (charset="utf-8"); an
        // empty value is treated as "no charset given" so the fallbacks apply.
        $charset = trim($charset, " \t\r\n\"'");
        if ('' === $charset) {
            $charset = null;
        }
    }

    // Fall back to a charset declared in the markup itself.
    if (null === $charset &&
        preg_match('/\<meta[^\>]+charset *= *["\']?([a-zA-Z\-0-9_:.]+)/i', $content, $matches)) {
        $charset = $matches[1];
    }

    if (null === $charset) {
        $charset = 'ISO-8859-1';
    }

    // The match above is case-insensitive, so the captured group may be "X":
    // normalise it, otherwise "TEXT/XML" would be parsed as HTML.
    if ('x' === strtolower($xmlMatches[1])) {
        $this->addXmlContent($content, $charset);
    } else {
        $this->addHtmlContent($content, $charset);
    }
}
-
/**
 * Parses an HTML string and adds its document element to the crawler.
 *
 * libxml errors are collected internally and entity loading is disabled
 * while parsing; both settings are restored afterwards. If the parsed
 * markup contains a base element with a non-empty href, the crawler's base
 * href is updated from it.
 *
 * @param string $content The HTML content
 * @param string $charset The charset
 */
public function addHtmlContent($content, $charset = 'UTF-8')
{
    $previousInternalErrors = libxml_use_internal_errors(true);
    $previousEntityLoader = libxml_disable_entity_loader(true);

    $document = new \DOMDocument('1.0', $charset);
    $document->validateOnParse = true;

    // Any PHP error raised during the conversion becomes an exception, so a
    // failed conversion simply leaves $content as it was.
    $throwOnError = function () {
        throw new \Exception();
    };
    set_error_handler($throwOnError);
    try {
        $content = mb_convert_encoding($content, 'HTML-ENTITIES', $charset);
    } catch (\Exception $e) {
        // Conversion failed: keep the original content.
    }
    restore_error_handler();

    if ('' !== trim($content)) {
        @$document->loadHTML($content);
    }

    libxml_use_internal_errors($previousInternalErrors);
    libxml_disable_entity_loader($previousEntityLoader);

    $this->addDocument($document);

    $hrefs = $this->filterRelativeXPath('descendant-or-self::base')->extract(array('href'));
    $href = current($hrefs);
    if (!count($hrefs) || empty($href)) {
        return;
    }

    if (!$this->baseHref) {
        $this->baseHref = $href;

        return;
    }

    // Resolve the (possibly relative) base href against the current one by
    // wrapping it in a throw-away anchor and asking Link for its URI.
    $anchor = $document->createElement('a');
    $anchor->setAttribute('href', $href);
    $resolved = new Link($anchor, $this->baseHref);
    $this->baseHref = $resolved->getUri();
}
-
/**
 * Parses an XML string and adds its document element to the crawler.
 *
 * When the content contains no prefixed namespace declaration ("xmlns:"),
 * every occurrence of "xmlns" is rewritten to "ns" before parsing. The
 * crawler is flagged as non-HTML afterwards.
 *
 * @param string $content The XML content
 * @param string $charset The charset
 * @param int    $options Bitwise OR of the libxml option constants
 */
public function addXmlContent($content, $charset = 'UTF-8', $options = LIBXML_NONET)
{
    $hasPrefixedNamespace = preg_match('/xmlns:/', $content);
    if (!$hasPrefixedNamespace) {
        $content = str_replace('xmlns', 'ns', $content);
    }

    $previousInternalErrors = libxml_use_internal_errors(true);
    $previousEntityLoader = libxml_disable_entity_loader(true);

    $document = new \DOMDocument('1.0', $charset);
    $document->validateOnParse = true;

    if ('' !== trim($content)) {
        @$document->loadXML($content, $options);
    }

    libxml_use_internal_errors($previousInternalErrors);
    libxml_disable_entity_loader($previousEntityLoader);

    $this->addDocument($document);

    $this->isHtml = false;
}
-
/**
 * Adds the root element of a document; a document without one is ignored.
 *
 * @param \DOMDocument $dom A DOMDocument instance
 */
public function addDocument(\DOMDocument $dom)
{
    $root = $dom->documentElement;
    if (!$root) {
        return;
    }

    $this->addNode($root);
}
-
/**
 * Adds every DOMNode found in a node list; other entries are skipped.
 *
 * @param \DOMNodeList $nodes A DOMNodeList instance
 */
public function addNodeList(\DOMNodeList $nodes)
{
    foreach ($nodes as $candidate) {
        if (!$candidate instanceof \DOMNode) {
            continue;
        }

        $this->addNode($candidate);
    }
}
-
/**
 * Adds each entry of an array through add(), so entries may themselves be
 * nodes, node lists, arrays or strings.
 *
 * @param array $nodes An array of items accepted by add()
 */
public function addNodes(array $nodes)
{
    foreach ($nodes as $item) {
        $this->add($item);
    }
}
-
/**
 * Adds a single node to the crawler.
 *
 * A DOMDocument is replaced by its document element; a document without a
 * root element contributes nothing. All nodes of a crawler must belong to
 * the same document, and a node already present is not added twice.
 *
 * @param \DOMNode $node A DOMNode instance
 *
 * @throws \InvalidArgumentException When the node belongs to a different document
 */
public function addNode(\DOMNode $node)
{
    if ($node instanceof \DOMDocument) {
        $node = $node->documentElement;

        // An empty document has no root element: without this guard a null
        // would be dereferenced below and then stored as a "node".
        if (null === $node) {
            return;
        }
    }

    if (null !== $this->document && $this->document !== $node->ownerDocument) {
        throw new \InvalidArgumentException('Attaching DOM nodes from multiple documents in the same crawler is forbidden.');
    }

    if (null === $this->document) {
        $this->document = $node->ownerDocument;
    }

    // Don't add duplicate nodes in the Crawler
    if (in_array($node, $this->nodes, true)) {
        return;
    }

    $this->nodes[] = $node;
}
-
/**
 * Returns a crawler for the node at the given position, or an empty crawler
 * when there is no node at that position.
 *
 * @param int $position The position
 *
 * @return Crawler
 */
public function eq($position)
{
    $selected = isset($this->nodes[$position]) ? $this->nodes[$position] : null;

    return $this->createSubCrawler($selected);
}
-
/**
 * Calls a closure for each node and collects the return values.
 *
 * The closure receives a crawler wrapping the node and the node's index.
 *
 * @param \Closure $closure A closure
 *
 * @return array The values returned by the closure, in node order
 */
public function each(\Closure $closure)
{
    $results = array();

    foreach ($this->nodes as $index => $current) {
        $wrapped = $this->createSubCrawler($current);
        $results[] = $closure($wrapped, $index);
    }

    return $results;
}
-
/**
 * Returns a crawler over a slice of the node list (array_slice semantics).
 *
 * @param int      $offset
 * @param int|null $length
 *
 * @return Crawler
 */
public function slice($offset = 0, $length = null)
{
    $subset = array_slice($this->nodes, $offset, $length);

    return $this->createSubCrawler($subset);
}
-
/**
 * Returns a crawler without the nodes for which the closure returns false.
 *
 * Only a strict false removes a node; any other return value keeps it. The
 * closure receives a crawler wrapping the node and the node's index.
 *
 * @param \Closure $closure A closure
 *
 * @return Crawler
 */
public function reduce(\Closure $closure)
{
    $kept = array();

    foreach ($this->nodes as $index => $current) {
        if (false === $closure($this->createSubCrawler($current), $index)) {
            continue;
        }

        $kept[] = $current;
    }

    return $this->createSubCrawler($kept);
}
-
/**
 * Returns a crawler for the first node (empty when there are no nodes).
 *
 * @return Crawler
 */
public function first()
{
    $firstIndex = 0;

    return $this->eq($firstIndex);
}
-
/**
 * Returns a crawler for the last node (empty when there are no nodes).
 *
 * @return Crawler
 */
public function last()
{
    $lastIndex = count($this->nodes) - 1;

    return $this->eq($lastIndex);
}
-
/**
 * Returns the element siblings of the first node, the node itself excluded.
 *
 * @return Crawler
 *
 * @throws \InvalidArgumentException When the current node list is empty
 */
public function siblings()
{
    if (empty($this->nodes)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    // Walk from the parent's first child so siblings on both sides are seen.
    $firstChildOfParent = $this->getNode(0)->parentNode->firstChild;

    return $this->createSubCrawler($this->sibling($firstChildOfParent));
}
-
/**
 * Returns the element siblings that follow the first node.
 *
 * @return Crawler
 *
 * @throws \InvalidArgumentException When the current node list is empty
 */
public function nextAll()
{
    if (empty($this->nodes)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $following = $this->sibling($this->getNode(0));

    return $this->createSubCrawler($following);
}
-
/**
 * Returns the element siblings that precede the first node, nearest first.
 *
 * @return Crawler
 *
 * @throws \InvalidArgumentException When the current node list is empty
 */
public function previousAll()
{
    if (empty($this->nodes)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $preceding = $this->sibling($this->getNode(0), 'previousSibling');

    return $this->createSubCrawler($preceding);
}
-
/**
 * Returns the element ancestors of the first node, closest ancestor first.
 *
 * @return Crawler
 *
 * @throws \InvalidArgumentException When the current node list is empty
 */
public function parents()
{
    if (empty($this->nodes)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $ancestors = array();

    for ($parent = $this->getNode(0)->parentNode; $parent; $parent = $parent->parentNode) {
        // Non-element ancestors (such as the document itself) are skipped.
        if (XML_ELEMENT_NODE !== $parent->nodeType) {
            continue;
        }

        $ancestors[] = $parent;
    }

    return $this->createSubCrawler($ancestors);
}
-
/**
 * Returns the element children of the first node.
 *
 * @return Crawler
 *
 * @throws \InvalidArgumentException When the current node list is empty
 */
public function children()
{
    if (empty($this->nodes)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstChild = $this->getNode(0)->firstChild;

    if ($firstChild) {
        $childElements = $this->sibling($firstChild);
    } else {
        $childElements = array();
    }

    return $this->createSubCrawler($childElements);
}
-
/**
 * Returns the value of an attribute of the first node.
 *
 * @param string $attribute The attribute name
 *
 * @return string|null The attribute value, or null when the attribute is absent
 *
 * @throws \InvalidArgumentException When the current node list is empty
 */
public function attr($attribute)
{
    if (empty($this->nodes)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $element = $this->getNode(0);

    if ($element->hasAttribute($attribute)) {
        return $element->getAttribute($attribute);
    }

    return null;
}
-
/**
 * Returns the node name of the first node.
 *
 * @return string
 *
 * @throws \InvalidArgumentException When the current node list is empty
 */
public function nodeName()
{
    if (empty($this->nodes)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstNode = $this->getNode(0);

    return $firstNode->nodeName;
}
-
/**
 * Returns the node value (text content) of the first node.
 *
 * @return string
 *
 * @throws \InvalidArgumentException When the current node list is empty
 */
public function text()
{
    if (empty($this->nodes)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $firstNode = $this->getNode(0);

    return $firstNode->nodeValue;
}
-
/**
 * Returns the inner HTML of the first node: the serialisation of each of
 * its child nodes, concatenated in document order.
 *
 * @return string
 *
 * @throws \InvalidArgumentException When the current node list is empty
 */
public function html()
{
    if (empty($this->nodes)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $fragments = array();
    foreach ($this->getNode(0)->childNodes as $childNode) {
        $fragments[] = $childNode->ownerDocument->saveHTML($childNode);
    }

    return implode('', $fragments);
}
-
/**
 * Extracts attribute values and/or the node value from every node.
 *
 * The special name "_text" stands for the node value. With exactly one
 * requested attribute each entry of the result is that single value;
 * otherwise each entry is an array of values in the order requested (an
 * empty array when no attribute is requested).
 *
 * Example:
 *
 *     $crawler->filter('h1 a')->extract(array('_text', 'href'));
 *
 * @param array|string $attributes An attribute name or an array of attribute names
 *
 * @return array One entry per node
 */
public function extract($attributes)
{
    $attributes = (array) $attributes;
    $count = count($attributes);

    $data = array();
    foreach ($this->nodes as $node) {
        $elements = array();
        foreach ($attributes as $attribute) {
            if ('_text' === $attribute) {
                $elements[] = $node->nodeValue;
            } else {
                $elements[] = $node->getAttribute($attribute);
            }
        }

        // Unwrap only when exactly one value was requested; testing
        // "$count > 1" instead would read the undefined $elements[0] for an
        // empty attribute list.
        $data[] = 1 === $count ? $elements[0] : $elements;
    }

    return $data;
}
-
/**
 * Filters the node list with an XPath expression.
 *
 * The expression is first rewritten by relativize() so that it is
 * evaluated relative to each node of the crawler.
 *
 * @param string $xpath An XPath expression
 *
 * @return Crawler
 */
public function filterXPath($xpath)
{
    $relativeXPath = $this->relativize($xpath);

    // If we dropped all expressions in the XPath while preparing it, there would be no match
    if ('' !== $relativeXPath) {
        return $this->filterRelativeXPath($relativeXPath);
    }

    return $this->createSubCrawler(null);
}
-
/**
 * Filters the node list with a CSS selector.
 *
 * The selector is converted to XPath by CssSelectorConverter, which is
 * told whether the crawler holds HTML content.
 *
 * @param string $selector A CSS selector
 *
 * @return Crawler
 *
 * @throws \RuntimeException When the CssSelectorConverter class is not available
 */
public function filter($selector)
{
    $converterAvailable = class_exists('Symfony\\Component\\CssSelector\\CssSelectorConverter');
    if (!$converterAvailable) {
        throw new \RuntimeException('Unable to filter with a CSS selector as the Symfony CssSelector 2.8+ is not installed (you can use filterXPath instead).');
    }

    $converter = new CssSelectorConverter($this->isHtml);
    $xpath = $converter->toXPath($selector);

    // The CssSelector already prefixes the selector with descendant-or-self::
    return $this->filterRelativeXPath($xpath);
}
-
/**
 * Selects links by their text, or by the alt text of an image they contain.
 *
 * Matching is done on whitespace-normalised text and looks for $value
 * surrounded by spaces, i.e. as a whole word sequence.
 *
 * @param string $value The link text
 *
 * @return Crawler
 */
public function selectLink($value)
{
    // Pad with spaces so only whole-word occurrences match.
    $needle = static::xpathLiteral(' '.$value.' ');

    $byText = sprintf('descendant-or-self::a[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) ', $needle);
    $byImageAlt = sprintf('or ./img[contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)]]', $needle);

    return $this->filterRelativeXPath($byText.$byImageAlt);
}
-
/**
 * Selects buttons by label, id or name.
 *
 * Matches input elements of type submit/button (by value) or image (by
 * alt), and button elements (by text); any of them also matches when its
 * id or name equals $value. The type attribute is compared lower-cased.
 *
 * @param string $value The button text
 *
 * @return Crawler
 */
public function selectButton($value)
{
    $translate = 'translate(@type, "ABCDEFGHIJKLMNOPQRSTUVWXYZ", "abcdefghijklmnopqrstuvwxyz")';

    // Space-padded literal for whole-word label matching, exact literal for id/name.
    $label = static::xpathLiteral(' '.$value.' ');
    $exact = static::xpathLiteral($value);

    $inputByValue = sprintf('descendant-or-self::input[((contains(%s, "submit") or contains(%s, "button")) and contains(concat(\' \', normalize-space(string(@value)), \' \'), %s)) ', $translate, $translate, $label);
    $inputByAltOrId = sprintf('or (contains(%s, "image") and contains(concat(\' \', normalize-space(string(@alt)), \' \'), %s)) or @id=%s or @name=%s] ', $translate, $label, $exact, $exact);
    $buttonElement = sprintf('| descendant-or-self::button[contains(concat(\' \', normalize-space(string(.)), \' \'), %s) or @id=%s or @name=%s]', $label, $exact, $exact);

    return $this->filterRelativeXPath($inputByValue.$inputByAltOrId.$buttonElement);
}
-
/**
 * Returns a Link object for the first node in the list.
 *
 * @param string $method The method for the link (get by default)
 *
 * @return Link
 *
 * @throws \InvalidArgumentException When the node list is empty or the first node is not a DOMElement
 */
public function link($method = 'get')
{
    if (empty($this->nodes)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $element = $this->getNode(0);

    if ($element instanceof \DOMElement) {
        return new Link($element, $this->baseHref, $method);
    }

    throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($element)));
}
-
/**
 * Returns one Link object (method "get") per node of the list.
 *
 * @return Link[]
 *
 * @throws \InvalidArgumentException When a node of the list is not a DOMElement
 */
public function links()
{
    $result = array();

    foreach ($this->nodes as $element) {
        if ($element instanceof \DOMElement) {
            $result[] = new Link($element, $this->baseHref, 'get');

            continue;
        }

        throw new \InvalidArgumentException(sprintf('The current node list should contain only DOMElement instances, "%s" found.', get_class($element)));
    }

    return $result;
}
-
/**
 * Returns a Form object for the first node in the list.
 *
 * @param array|null  $values Values passed to the form's setValues() when not null
 * @param string|null $method The method for the form
 *
 * @return Form
 *
 * @throws \InvalidArgumentException When the node list is empty or the first node is not a DOMElement
 */
public function form(array $values = null, $method = null)
{
    if (empty($this->nodes)) {
        throw new \InvalidArgumentException('The current node list is empty.');
    }

    $element = $this->getNode(0);
    if (!($element instanceof \DOMElement)) {
        throw new \InvalidArgumentException(sprintf('The selected node should be instance of DOMElement, got "%s".', get_class($element)));
    }

    $form = new Form($element, $this->uri, $method, $this->baseHref);

    if (null === $values) {
        return $form;
    }

    $form->setValues($values);

    return $form;
}
-
/**
 * Sets the prefix that XPath expressions use to refer to the document's
 * default (unprefixed) namespace.
 *
 * @param string $prefix
 */
public function setDefaultNamespacePrefix($prefix)
{
    $this->defaultNamespacePrefix = $prefix;
}
-
/**
 * Registers a namespace URI for a prefix. Registered prefixes take
 * precedence over namespaces discovered in the document.
 *
 * @param string $prefix
 * @param string $namespace
 */
public function registerNamespace($prefix, $namespace)
{
    $this->namespaces[$prefix] = $namespace;
}
-
/**
 * Converts a string to an XPath string literal.
 *
 * XPath literals have no escape mechanism, so:
 *  - a string without single quotes is wrapped in single quotes;
 *  - otherwise a string without double quotes is wrapped in double quotes;
 *  - a string containing both is split on its single quotes and rebuilt
 *    with concat(), each single quote being emitted as the literal "'".
 *
 * Examples:
 *
 *     foo      => 'foo'
 *     it's     => "it's"
 *     a'b"c    => concat('a', "'", 'b"c')
 *
 * @param string $s String to be escaped
 *
 * @return string Converted string
 */
public static function xpathLiteral($s)
{
    if (false === strpos($s, "'")) {
        return sprintf("'%s'", $s);
    }

    if (false === strpos($s, '"')) {
        return sprintf('"%s"', $s);
    }

    $string = $s;
    $parts = array();
    while (true) {
        if (false !== $pos = strpos($string, "'")) {
            $parts[] = sprintf("'%s'", substr($string, 0, $pos));
            $parts[] = "\"'\"";
            $string = substr($string, $pos + 1);
        } else {
            $parts[] = "'$string'";
            break;
        }
    }

    // Separator first: the reversed (array, separator) order is deprecated
    // since PHP 7.4 and is a TypeError from PHP 8.0 on.
    return sprintf('concat(%s)', implode(', ', $parts));
}
-
/**
 * Evaluates an already-relative XPath expression against every node of the
 * crawler and gathers the matches in a new crawler.
 *
 * The expression is used as-is, without being rewritten by relativize().
 *
 * @param string $xpath
 *
 * @return Crawler
 */
private function filterRelativeXPath($xpath)
{
    $prefixes = $this->findNamespacePrefixes($xpath);
    $result = $this->createSubCrawler(null);

    foreach ($this->nodes as $contextNode) {
        $evaluator = $this->createDOMXPath($contextNode->ownerDocument, $prefixes);
        $matches = $evaluator->query($xpath, $contextNode);
        $result->add($matches);
    }

    return $result;
}
-
/**
 * Rewrites an XPath expression so that it can be evaluated relative to a
 * node of the crawler.
 *
 * The expression is split on the union operator ("|", ignoring those
 * inside predicates) and each part is rewritten on its own; parts that can
 * never match relative to a node are replaced by an expression that
 * matches nothing.
 *
 * @param string $xpath
 *
 * @return string The rewritten parts joined with " | "
 */
private function relativize($xpath)
{
    // An expression which will never match to replace expressions which cannot match in the crawler
    // We cannot simply drop them, as that would break expressions with several unions
    $nonMatchingExpression = 'a[name() = "b"]';

    $relativized = array();

    // Split any unions into individual expressions.
    foreach (preg_split('/\|(?![^\[]*\])/', $xpath) as $part) {
        $part = trim($part);
        $leading = '';

        // If the union is inside some braces, we need to preserve the opening braces and apply
        // the change only inside it.
        if (preg_match('/^[\(\s*]+/', $part, $matches)) {
            $leading = $matches[0];
            $part = substr($part, strlen($leading));
        }

        // BC for Symfony 2.4 and lower were elements were adding in a fake _root parent
        if (0 === strpos($part, 'self::*/')) {
            $part = './'.substr($part, strlen('self::*/'));
        }

        // The order of the branches below is significant.
        if (empty($part)) {
            // add a fake expression for an empty part
            $part = $nonMatchingExpression;
        } elseif (0 === strpos($part, '//')) {
            $part = 'descendant-or-self::'.substr($part, strlen('//'));
        } elseif (0 === strpos($part, './/')) {
            $part = 'descendant-or-self::'.substr($part, strlen('.//'));
        } elseif (0 === strpos($part, './')) {
            $part = 'self::'.substr($part, strlen('./'));
        } elseif (0 === strpos($part, 'child::')) {
            $part = 'self::'.substr($part, strlen('child::'));
        } elseif (in_array($part[0], array('/', '.'), true) || 0 === strpos($part, 'self::')) {
            // absolute paths, other dot-relative forms and self:: cannot be made relative
            $part = $nonMatchingExpression;
        } elseif (0 === strpos($part, 'descendant::')) {
            $part = 'descendant-or-self::'.substr($part, strlen('descendant::'));
        } elseif (preg_match('/^(ancestor|ancestor-or-self|attribute|following|following-sibling|namespace|parent|preceding|preceding-sibling)::/', $part)) {
            // the fake root has no parent, preceding or following nodes and also no attributes (even no namespace attributes)
            $part = $nonMatchingExpression;
        } elseif (0 !== strpos($part, 'descendant-or-self::')) {
            $part = 'self::'.$part;
        }

        $relativized[] = $leading.$part;
    }

    return implode(' | ', $relativized);
}
-
/**
 * Returns the node stored at the given position.
 *
 * @param int $position
 *
 * @return \DOMNode|null The node, or null when there is none at that position
 */
public function getNode($position)
{
    if (!isset($this->nodes[$position])) {
        return null;
    }

    return $this->nodes[$position];
}
-
/**
 * Returns the number of nodes held by the crawler (Countable).
 *
 * @return int
 */
public function count()
{
    $total = count($this->nodes);

    return $total;
}
-
/**
 * Returns an iterator over the crawler's nodes (IteratorAggregate).
 *
 * @return \ArrayIterator
 */
public function getIterator()
{
    $iterator = new \ArrayIterator($this->nodes);

    return $iterator;
}
-
/**
 * Collects element nodes by walking from $node in one sibling direction.
 *
 * $node itself is examined first. The crawler's first node is never part
 * of the result.
 *
 * @param \DOMNode $node       The node to start from
 * @param string   $siblingDir The property to follow: "nextSibling" or "previousSibling"
 *
 * @return array The element nodes encountered, in walking order
 */
protected function sibling($node, $siblingDir = 'nextSibling')
{
    $reference = $this->getNode(0);
    $collected = array();

    do {
        $isElement = XML_ELEMENT_NODE === $node->nodeType;
        if ($isElement && $node !== $reference) {
            $collected[] = $node;
        }
    } while ($node = $node->$siblingDir);

    return $collected;
}
-
/**
 * Creates a DOMXPath for a document and registers on it every given prefix
 * for which a namespace can be found.
 *
 * @param \DOMDocument $document
 * @param array        $prefixes Namespace prefixes used by the expression to evaluate
 *
 * @return \DOMXPath
 */
private function createDOMXPath(\DOMDocument $document, array $prefixes = array())
{
    $evaluator = new \DOMXPath($document);

    foreach ($prefixes as $prefix) {
        $uri = $this->discoverNamespace($evaluator, $prefix);
        if (null === $uri) {
            continue;
        }

        $evaluator->registerNamespace($prefix, $uri);
    }

    return $evaluator;
}
-
/**
 * Finds the namespace URI for a prefix.
 *
 * Explicitly registered namespaces win; otherwise the document is searched
 * for a namespace declaration with that name. The configured default
 * prefix is looked up as the unnamed (default) namespace.
 *
 * @param \DOMXPath $domxpath
 * @param string    $prefix
 *
 * @return string|null The namespace URI, or null when none is found
 */
private function discoverNamespace(\DOMXPath $domxpath, $prefix)
{
    if (isset($this->namespaces[$prefix])) {
        return $this->namespaces[$prefix];
    }

    // ask for one namespace, otherwise we'd get a collection with an item for each node with the namespace
    $declaredName = $this->defaultNamespacePrefix === $prefix ? '' : $prefix;
    $declarations = $domxpath->query(sprintf('(//namespace::*[name()="%s"])[last()]', $declaredName));

    $declaration = $declarations->item(0);
    if ($declaration) {
        return $declaration->nodeValue;
    }

    return null;
}
-
/**
 * Extracts the distinct namespace prefixes used in an XPath expression.
 *
 * A prefix is an identifier followed by a single colon and then a
 * character other than a double quote, slash or colon.
 *
 * @param string $xpath
 *
 * @return array The unique prefixes found (empty when there are none)
 */
private function findNamespacePrefixes($xpath)
{
    $found = preg_match_all('/(?P<prefix>[a-z_][a-z_0-9\-\.]*+):[^"\/:]/i', $xpath, $matches);
    if (!$found) {
        return array();
    }

    return array_unique($matches['prefix']);
}
-
/**
 * Creates a crawler of the same (late-bound) class for the given nodes,
 * carrying over this crawler's URI, base href, HTML flag, document and
 * registered namespaces.
 *
 * @param \DOMNode|\DOMNodeList|array|string|null $nodes
 *
 * @return static
 */
private function createSubCrawler($nodes)
{
    $child = new static($nodes, $this->uri, $this->baseHref);

    // Assigned after construction, so they override whatever add() set up.
    $child->namespaces = $this->namespaces;
    $child->document = $this->document;
    $child->isHtml = $this->isHtml;

    return $child;
}
- }
|