public class ExtractorHTML extends ContentExtractor implements org.springframework.beans.factory.InitializingBean
Modifier and Type | Field and Description |
---|---|
static String |
A_FORM_OFFSETS |
static String |
A_META_ROBOTS |
protected ExtractorJS |
extractorJS
Javascript extractor to use to process inline javascript.
|
protected CrawlMetadata |
metadata
CrawlMetadata provides the robots honoring policy to use when
considering a robots META tag.
|
DEFAULT_PARAMETERS, extractorParameters, loggerModule, numberOfLinksExtracted
Constructor and Description |
---|
ExtractorHTML() |
Modifier and Type | Method and Description |
---|---|
protected void |
addLinkFromString(CrawlURI curi,
CharSequence uri,
CharSequence context,
Hop hop) |
void |
afterPropertiesSet() |
protected void |
considerIfLikelyUri(CrawlURI curi,
CharSequence candidate,
CharSequence valueContext,
Hop hop)
Consider whether a given string is URI-like.
|
protected void |
considerQueryStringValues(CrawlURI curi,
CharSequence queryString,
CharSequence valueContext,
Hop hop)
Consider a query-string-like collection of key=value[&key=value]
pairs for URI-like strings in the values.
|
static CharSequence |
elementContext(CharSequence element,
CharSequence attribute)
Create a suitable XPath-like context from an element name and optional
attribute name.
|
protected void |
extract(CrawlURI curi,
CharSequence cs)
Run extractor.
|
protected Charset |
getContentDeclaredCharset(CrawlURI curi,
String contentPrefix) |
boolean |
getExtractJavascript() |
boolean |
getExtractOnlyFormGets() |
ExtractorJS |
getExtractorJS() |
boolean |
getExtractValueAttributes() |
boolean |
getIgnoreFormActionUrls() |
boolean |
getIgnoreUnexpectedHtml() |
int |
getMaxAttributeNameLength() |
int |
getMaxAttributeValLength() |
int |
getMaxElementLength() |
CrawlMetadata |
getMetadata() |
boolean |
getTreatFramesAsEmbedLinks() |
boolean |
innerExtract(CrawlURI curi)
Actually extracts links.
|
protected boolean |
isHtmlExpectedHere(CrawlURI curi)
Test whether this HTML is so unexpected (eg in place of a GIF URI)
that it shouldn't be scanned for links.
|
protected void |
processEmbed(CrawlURI curi,
CharSequence value,
CharSequence context) |
protected void |
processEmbed(CrawlURI curi,
CharSequence value,
CharSequence context,
Hop hop) |
protected void |
processGeneralTag(CrawlURI curi,
CharSequence element,
CharSequence cs) |
protected void |
processLink(CrawlURI curi,
CharSequence value,
CharSequence context)
Handle generic HREF cases.
|
protected boolean |
processMeta(CrawlURI curi,
CharSequence cs)
Process metadata tags.
|
protected void |
processScript(CrawlURI curi,
CharSequence sequence,
int endOfOpenTag) |
protected void |
processScriptCode(CrawlURI curi,
CharSequence cs)
Extract the (java)script source in the given CharSequence.
|
protected void |
processStyle(CrawlURI curi,
CharSequence sequence,
int endOfOpenTag)
Process style text.
|
void |
setExtractJavascript(boolean extractJavascript) |
void |
setExtractOnlyFormGets(boolean onlyGets) |
void |
setExtractorJS(ExtractorJS extractorJS) |
void |
setExtractValueAttributes(boolean extractValueAttributes) |
void |
setIgnoreFormActionUrls(boolean ignoreActions) |
void |
setIgnoreUnexpectedHtml(boolean ignoreUnexpectedHtml) |
void |
setMaxAttributeNameLength(int max) |
void |
setMaxAttributeValLength(int max) |
void |
setMaxElementLength(int max) |
void |
setMetadata(CrawlMetadata provider) |
void |
setTreatFramesAsEmbedLinks(boolean asEmbeds) |
protected boolean |
shouldExtract(CrawlURI uri)
Determines if otherwise valid URIs should have links extracted or not.
|
extract, shouldProcess
addOutlink, fromCheckpointJson, getExtractorParameters, getLoggerModule, innerProcess, logUriError, report, setExtractorParameters, setLoggerModule, toCheckpointJson
doCheckpoint, finishCheckpoint, flattenVia, getBeanName, getEnabled, getKeyedProperties, getRecordedSize, getShouldProcessRule, getURICount, hasHttpAuthenticationCredential, innerProcessResult, innerRejectProcess, isRunning, isSuccess, process, setBeanName, setEnabled, setRecoveryCheckpoint, setShouldProcessRule, start, startCheckpoint, stop
public static final String A_META_ROBOTS
public static final String A_FORM_OFFSETS
protected CrawlMetadata metadata
protected transient ExtractorJS extractorJS
public int getMaxElementLength()
public void setMaxElementLength(int max)
public int getMaxAttributeNameLength()
public void setMaxAttributeNameLength(int max)
public int getMaxAttributeValLength()
public void setMaxAttributeValLength(int max)
public boolean getTreatFramesAsEmbedLinks()
public void setTreatFramesAsEmbedLinks(boolean asEmbeds)
public boolean getIgnoreFormActionUrls()
public void setIgnoreFormActionUrls(boolean ignoreActions)
public boolean getExtractOnlyFormGets()
public void setExtractOnlyFormGets(boolean onlyGets)
public boolean getExtractJavascript()
public void setExtractJavascript(boolean extractJavascript)
public boolean getExtractValueAttributes()
public void setExtractValueAttributes(boolean extractValueAttributes)
public boolean getIgnoreUnexpectedHtml()
public void setIgnoreUnexpectedHtml(boolean ignoreUnexpectedHtml)
public CrawlMetadata getMetadata()
public void setMetadata(CrawlMetadata provider)
public ExtractorJS getExtractorJS()
public void setExtractorJS(ExtractorJS extractorJS)
public void afterPropertiesSet()
afterPropertiesSet
in interface org.springframework.beans.factory.InitializingBean
protected void processGeneralTag(CrawlURI curi, CharSequence element, CharSequence cs)
protected void considerQueryStringValues(CrawlURI curi, CharSequence queryString, CharSequence valueContext, Hop hop)
curi
- origin CrawlURI
queryString
- query-string-like string
valueContext
- page context where found
protected void considerIfLikelyUri(CrawlURI curi, CharSequence candidate, CharSequence valueContext, Hop hop)
curi
- origin CrawlURI
queryString
- query-string-like string
valueContext
- page context where found
protected void processScriptCode(CrawlURI curi, CharSequence cs)
curi
- source CrawlURI
cs
- CharSequence of javascript code
protected void processLink(CrawlURI curi, CharSequence value, CharSequence context)
curi
-
value
-
context
-
protected void addLinkFromString(CrawlURI curi, CharSequence uri, CharSequence context, Hop hop)
protected final void processEmbed(CrawlURI curi, CharSequence value, CharSequence context)
protected void processEmbed(CrawlURI curi, CharSequence value, CharSequence context, Hop hop)
protected boolean shouldExtract(CrawlURI uri)
ContentExtractor
The ExtractorHTML implementation checks that the content-type of
the given URI is text/html.
shouldExtract
in class ContentExtractor
uri
- the URI to check
public boolean innerExtract(CrawlURI curi)
ContentExtractor
#shouldProcess(ExtractorURI). Subclasses
should implement this method to discover outlinks in the URI's
content stream. For instance, ExtractorHTML extracts links
from Anchor tags and so on.
This method should only return true if extraction completed successfully. If not (for instance, if an IO error occurred), then this method should return false. Returning false indicates to the pipeline that downstream extractors should attempt to extract links themselves. Returning true indicates that downstream extractors should be skipped.
innerExtract
in class ContentExtractor
curi
- the URI whose links to extract
protected Charset getContentDeclaredCharset(CrawlURI curi, String contentPrefix)
protected void extract(CrawlURI curi, CharSequence cs)
curi
- CrawlURI we're processing.
cs
- Sequence from underlying ReplayCharSequence. This
is TRANSIENT data. Make a copy if you want the data to live outside
of this extractor's lifetime.
protected boolean isHtmlExpectedHere(CrawlURI curi) throws org.apache.commons.httpclient.URIException
curi
- CrawlURI to examine.
org.apache.commons.httpclient.URIException
protected void processScript(CrawlURI curi, CharSequence sequence, int endOfOpenTag)
protected boolean processMeta(CrawlURI curi, CharSequence cs)
curi
- CrawlURI we're processing.
cs
- Sequence from underlying ReplayCharSequence. This
is TRANSIENT data. Make a copy if you want the data to live outside
of this extractor's lifetime.
protected void processStyle(CrawlURI curi, CharSequence sequence, int endOfOpenTag)
curi
- CrawlURI we're processing.
sequence
- Sequence from underlying ReplayCharSequence. This
is TRANSIENT data. Make a copy if you want the data to live outside
of this extractor's lifetime.
endOfOpenTag
-
public static CharSequence elementContext(CharSequence element, CharSequence attribute)
element
-
attribute
-
Copyright © 2003-2014 Internet Archive. All Rights Reserved.