public class FetchHTTP extends Processor implements org.springframework.context.Lifecycle
Modifier and Type | Field and Description |
---|---|
protected CookieStorage |
cookieStorage |
protected String |
digestAlgorithm
Which algorithm (for example MD5 or SHA-1) to use to perform an
on-the-fly digest hash of retrieved content-bodies.
|
static String |
HTTP_BIND_ADDRESS |
static String |
HTTP_SCHEME |
static String |
HTTPS_SCHEME |
static String |
RANGE |
static String |
RANGE_PREFIX |
static String |
REFERER |
protected ServerCache |
serverCache
Used to do DNS lookups.
|
Constructor and Description |
---|
FetchHTTP()
Constructor.
|
Modifier and Type | Method and Description |
---|---|
protected void |
addResponseContent(org.apache.commons.httpclient.HttpMethod method,
CrawlURI curi)
This method populates
curi with response status and
content type. |
protected boolean |
checkMidfetchAbort(CrawlURI curi,
org.archive.httpclient.HttpRecorderMethod method,
HttpConnection conn) |
protected void |
cleanupHttp()
Perform any final cleanup related to the HttpClient instance.
|
protected void |
configureHttp() |
protected void |
configureHttp(int soTimeout,
String addressStr,
String proxy,
int port,
String user,
String password) |
protected org.apache.commons.httpclient.HostConfiguration |
configureMethod(CrawlURI curi,
org.apache.commons.httpclient.HttpMethod method)
Configure the HttpMethod setting options and headers.
|
protected void |
doAbort(CrawlURI curi,
org.apache.commons.httpclient.HttpMethod method,
String annotation) |
boolean |
getAcceptCompression() |
List<String> |
getAcceptHeaders() |
protected Object |
getAttributeEither(CrawlURI curi,
String key)
Get a value either from inside the CrawlURI instance, or from
settings (module attributes).
|
protected org.apache.commons.httpclient.auth.AuthScheme |
getAuthScheme(org.apache.commons.httpclient.HttpMethod method,
CrawlURI curi) |
CookieStorage |
getCookieStorage() |
CredentialStore |
getCredentialStore() |
Charset |
getDefaultCharset() |
String |
getDefaultEncoding() |
String |
getDigestAlgorithm() |
boolean |
getDigestContent() |
protected org.apache.commons.httpclient.HttpClient |
getHttp() |
String |
getHttpBindAddress()
Local IP address or hostname to use when making connections (binding
sockets).
|
String |
getHttpProxyHost() |
String |
getHttpProxyPassword() |
int |
getHttpProxyPort() |
String |
getHttpProxyUser() |
boolean |
getIgnoreCookies() |
int |
getMaxFetchKBSec() |
long |
getMaxLengthBytes() |
boolean |
getSendConnectionClose() |
boolean |
getSendIfModifiedSince() |
boolean |
getSendIfNoneMatch() |
boolean |
getSendRange() |
boolean |
getSendReferer() |
ServerCache |
getServerCache() |
DecideRule |
getShouldFetchBodyRule() |
int |
getSoTimeoutMs() |
org.archive.httpclient.ConfigurableX509TrustManager.TrustLevel |
getSslTrustLevel() |
int |
getTimeoutSeconds() |
boolean |
getUseHTTP11() |
UserAgentProvider |
getUserAgentProvider() |
protected void |
handle401(org.apache.commons.httpclient.HttpMethod method,
CrawlURI curi)
Server is looking for basic/digest auth credentials (RFC2617).
|
protected void |
innerProcess(CrawlURI curi)
Actually performs the process.
|
boolean |
isRunning() |
ProcessResult |
process(CrawlURI uri)
Processes the given URI.
|
String |
report() |
void |
setAcceptCompression(boolean acceptCompression) |
void |
setAcceptHeaders(List<String> headers) |
protected void |
setConditionalGetHeader(CrawlURI curi,
org.apache.commons.httpclient.HttpMethod method,
boolean conditional,
String sourceHeader,
String targetHeader)
Set the given conditional-GET header, if the setting is enabled and
a suitable value is available in the URI history.
|
void |
setCookieStorage(CookieStorage storage) |
void |
setCredentialStore(CredentialStore credentials) |
void |
setDefaultEncoding(String encoding) |
void |
setDigestAlgorithm(String digestAlgorithm) |
void |
setDigestContent(boolean digest) |
void |
setHttpBindAddress(String address) |
void |
setHttpProxyHost(String host) |
void |
setHttpProxyPassword(String password) |
void |
setHttpProxyPort(int port) |
void |
setHttpProxyUser(String user) |
void |
setIgnoreCookies(boolean ignoreCookies) |
void |
setMaxFetchKBSec(int rate) |
void |
setMaxLengthBytes(long timeout) |
void |
setSendConnectionClose(boolean sendClose) |
void |
setSendIfModifiedSince(boolean sendIfModifiedSince) |
void |
setSendIfNoneMatch(boolean sendIfNoneMatch) |
void |
setSendRange(boolean sendRange) |
void |
setSendReferer(boolean sendClose) |
void |
setServerCache(ServerCache serverCache) |
void |
setShouldFetchBodyRule(DecideRule rule) |
protected void |
setSizes(CrawlURI curi,
org.archive.util.Recorder rec)
Update CrawlURI internal sizes based on current transaction (and
in the case of 304s, history)
|
void |
setSoTimeoutMs(int timeout) |
void |
setSslTrustLevel(org.archive.httpclient.ConfigurableX509TrustManager.TrustLevel trustLevel) |
void |
setTimeoutSeconds(int timeout) |
void |
setUseHTTP11(boolean useHTTP11) |
void |
setUserAgentProvider(UserAgentProvider provider) |
protected boolean |
shouldProcess(CrawlURI curi)
Can this processor fetch the given CrawlURI.
|
void |
start() |
void |
stop() |
doCheckpoint, finishCheckpoint, flattenVia, fromCheckpointJson, getBeanName, getEnabled, getKeyedProperties, getRecordedSize, getShouldProcessRule, getURICount, hasHttpAuthenticationCredential, innerProcessResult, innerRejectProcess, isSuccess, setBeanName, setEnabled, setRecoveryCheckpoint, setShouldProcessRule, startCheckpoint, toCheckpointJson
protected String digestAlgorithm
public static final String REFERER
public static final String RANGE
public static final String RANGE_PREFIX
public static final String HTTP_SCHEME
public static final String HTTPS_SCHEME
protected CookieStorage cookieStorage
public static final String HTTP_BIND_ADDRESS
protected ServerCache serverCache
public String getHttpProxyHost()
public void setHttpProxyHost(String host)
public int getHttpProxyPort()
public void setHttpProxyPort(int port)
public String getHttpProxyUser()
public void setHttpProxyUser(String user)
public String getHttpProxyPassword()
public void setHttpProxyPassword(String password)
public int getTimeoutSeconds()
public void setTimeoutSeconds(int timeout)
public int getSoTimeoutMs()
public void setSoTimeoutMs(int timeout)
public long getMaxLengthBytes()
public void setMaxLengthBytes(long timeout)
public String getDefaultEncoding()
public void setDefaultEncoding(String encoding)
public Charset getDefaultCharset()
public boolean getDigestContent()
public void setDigestContent(boolean digest)
public String getDigestAlgorithm()
public void setDigestAlgorithm(String digestAlgorithm)
public int getMaxFetchKBSec()
public void setMaxFetchKBSec(int rate)
public UserAgentProvider getUserAgentProvider()
public void setUserAgentProvider(UserAgentProvider provider)
public org.archive.httpclient.ConfigurableX509TrustManager.TrustLevel getSslTrustLevel()
public void setSslTrustLevel(org.archive.httpclient.ConfigurableX509TrustManager.TrustLevel trustLevel)
public DecideRule getShouldFetchBodyRule()
public void setShouldFetchBodyRule(DecideRule rule)
public boolean getUseHTTP11()
public void setUseHTTP11(boolean useHTTP11)
public boolean getAcceptCompression()
public void setAcceptCompression(boolean acceptCompression)
public boolean getSendConnectionClose()
public void setSendConnectionClose(boolean sendClose)
public boolean getSendReferer()
public void setSendReferer(boolean sendClose)
public boolean getSendRange()
public void setSendRange(boolean sendRange)
public boolean getSendIfModifiedSince()
public void setSendIfModifiedSince(boolean sendIfModifiedSince)
public boolean getSendIfNoneMatch()
public void setSendIfNoneMatch(boolean sendIfNoneMatch)
public void setCookieStorage(CookieStorage storage)
public CookieStorage getCookieStorage()
public boolean getIgnoreCookies()
public void setIgnoreCookies(boolean ignoreCookies)
public String getHttpBindAddress()
public void setHttpBindAddress(String address)
public CredentialStore getCredentialStore()
public void setCredentialStore(CredentialStore credentials)
public ServerCache getServerCache()
public void setServerCache(ServerCache serverCache)
protected void innerProcess(CrawlURI curi) throws InterruptedException
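Description copied from class: Processor
Actually performs the process. By the time this method is invoked, it is known that the given URI passes the #ENABLED, the #DECIDE_RULES and the #shouldProcess(ProcessorURI) tests.
innerProcess
in class Processor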
curi
- the URI to process
InterruptedException
- if the thread is interrupted
protected void setSizes(CrawlURI curi, org.archive.util.Recorder rec)
curi
- CrawlURI
rec
- HttpRecorder
protected void doAbort(CrawlURI curi, org.apache.commons.httpclient.HttpMethod method, String annotation)
protected boolean checkMidfetchAbort(CrawlURI curi, org.archive.httpclient.HttpRecorderMethod method, HttpConnection conn)
protected void addResponseContent(org.apache.commons.httpclient.HttpMethod method, CrawlURI curi)
This method populates curi with response status and content type.
curi
- CrawlURI to populate.
method
- Method to get response status and headers from.
public ProcessResult process(CrawlURI uri) throws InterruptedException
Description copied from class: Processor
Processes the given URI. First checks #ENABLED and #DECIDE_RULES. If ENABLED is false, then nothing happens.
If the DECIDE_RULES indicate REJECT, then the
#innerRejectProcess(ProcessorURI)
method is invoked, and
the process method returns.
Next, the #shouldProcess(ProcessorURI)
method is
consulted to see if this Processor knows how to handle the given
URI. If it returns false, then nothing further occurs.
FIXME: Should innerRejectProcess be called when ENABLED is false, or when shouldProcess returns false? The previous Processor implementation didn't handle it that way.
Otherwise, the URI is considered valid. This processor's count
of handled URIs is incremented, and the
#innerProcess(ProcessorURI)
method is invoked to actually
perform the process.
process
in class Processor
uri
- The URI to process
InterruptedException
- if the thread is interrupted
protected boolean shouldProcess(CrawlURI curi)
shouldProcess
in class Processor
curi
- the CrawlURI to test
protected org.apache.commons.httpclient.HostConfiguration configureMethod(CrawlURI curi, org.apache.commons.httpclient.HttpMethod method)
curi
- CrawlURI from which we pull configuration.
method
- The Method to configure.
protected void setConditionalGetHeader(CrawlURI curi, org.apache.commons.httpclient.HttpMethod method, boolean conditional, String sourceHeader, String targetHeader)
curi
- source CrawlURI
method
- HTTP operation pending
conditional
- true/false value of the enablement setting to consult
sourceHeader
- header to consult in URI history
targetHeader
- header to set if possible
protected Object getAttributeEither(CrawlURI curi, String key)
curi
- CrawlURI to consult
key
- key to lookup
protected void handle401(org.apache.commons.httpclient.HttpMethod method, CrawlURI curi)
method
- Method that got a 401.
curi
- CrawlURI that got a 401.
protected org.apache.commons.httpclient.auth.AuthScheme getAuthScheme(org.apache.commons.httpclient.HttpMethod method, CrawlURI curi)
method
- Method that got a 401.
curi
- CrawlURI that got a 401.
public void start()
public boolean isRunning()
public void stop()
protected void cleanupHttp()
protected void configureHttp()
protected void configureHttp(int soTimeout, String addressStr, String proxy, int port, String user, String password)
protected org.apache.commons.httpclient.HttpClient getHttp()
Copyright © 2003-2014 Internet Archive. All Rights Reserved.