public class WARCWriterProcessor extends WriterPoolProcessor implements org.archive.io.warc.WARCWriterPoolSettings
TODO: Remove ANVLRecord. Rename NameValue or use RFC822 (commons-httpclient?) or find something else.
Modifier and Type | Field and Description |
---|---|
protected org.archive.uid.RecordIDGenerator |
generator
Generator for record IDs
|
ANNOTATION_UNWRITTEN, compress, directory, frequentFlushes, maxFileSizeBytes, maxTotalBytesToWrite, maxWaitForIdleMs, poolMaxActive, prefix, serverCache, skipIdenticalDigests, startNewFilesOnCheckpoint, storePaths, template, writeBufferSize
Constructor and Description |
---|
WARCWriterProcessor() |
Modifier and Type | Method and Description |
---|---|
protected void |
addIfNotBlank(org.archive.util.anvl.ANVLRecord record,
String label,
String value) |
protected void |
addStats(Map<String,Map<String,Long>> substats) |
protected void |
fromCheckpointJson(org.json.JSONObject json)
Restore internal state from a JSONObject stored at an earlier
checkpoint time.
|
long |
getDefaultMaxFileSize() |
List<ConfigPath> |
getDefaultStorePaths() |
List<String> |
getMetadata() |
protected URI |
getRecordID() |
org.archive.uid.RecordIDGenerator |
getRecordIDGenerator() |
boolean |
getWriteMetadata() |
boolean |
getWriteRequests() |
boolean |
getWriteRevisitForIdenticalDigests() |
boolean |
getWriteRevisitForNotModified() |
protected ProcessResult |
innerProcessResult(CrawlURI puri)
Writes a CrawlURI and its associated data to store file.
|
protected URI |
qualifyRecordID(URI base,
String key,
String value) |
String |
report() |
protected void |
saveHeader(String origName,
org.apache.commons.httpclient.HttpMethod method,
org.archive.util.anvl.ANVLRecord headers,
String newName)
Save a header from the given HTTP operation into the
provided headers under a new name
|
void |
setRecordIDGenerator(org.archive.uid.RecordIDGenerator generator) |
protected void |
setupPool(AtomicInteger serialNo)
Set up pool of files.
|
void |
setWriteMetadata(boolean writeMetadata) |
void |
setWriteRequests(boolean writeRequests) |
void |
setWriteRevisitForIdenticalDigests(boolean writeRevisits) |
void |
setWriteRevisitForNotModified(boolean writeRevisits) |
protected org.json.JSONObject |
toCheckpointJson()
Return a JSONObject of current state that can be consulted
on recovery to restore necessary values.
|
protected void |
updateMetadataAfterWrite(CrawlURI curi,
org.archive.io.warc.WARCWriter writer,
long startPosition) |
protected ProcessResult |
write(String lowerCaseScheme,
CrawlURI curi) |
protected void |
writeDnsRecords(CrawlURI curi,
org.archive.io.warc.WARCWriter w,
URI baseid,
String timestamp) |
protected URI |
writeFtpControlConversation(org.archive.io.warc.WARCWriter w,
String timestamp,
URI baseid,
CrawlURI curi,
org.archive.util.anvl.ANVLRecord headers,
String controlConversation) |
protected void |
writeFtpRecords(org.archive.io.warc.WARCWriter w,
CrawlURI curi,
URI baseid,
String timestamp) |
protected void |
writeHttpRecords(CrawlURI curi,
org.archive.io.warc.WARCWriter w,
URI baseid,
String timestamp) |
protected URI |
writeMetadata(org.archive.io.warc.WARCWriter w,
String timestamp,
URI baseid,
CrawlURI curi,
org.archive.util.anvl.ANVLRecord namedFields) |
protected URI |
writeRequest(org.archive.io.warc.WARCWriter w,
String timestamp,
String mimetype,
URI baseid,
CrawlURI curi,
org.archive.util.anvl.ANVLRecord namedFields) |
protected URI |
writeResource(org.archive.io.warc.WARCWriter w,
String timestamp,
String mimetype,
URI baseid,
CrawlURI curi,
org.archive.util.anvl.ANVLRecord namedFields) |
protected URI |
writeResponse(org.archive.io.warc.WARCWriter w,
String timestamp,
String mimetype,
URI baseid,
CrawlURI curi,
org.archive.util.anvl.ANVLRecord suppliedFields) |
protected URI |
writeRevisitDigest(org.archive.io.warc.WARCWriter w,
String timestamp,
String mimetype,
URI baseid,
CrawlURI curi,
org.archive.util.anvl.ANVLRecord namedFields) |
protected URI |
writeRevisitDigest(org.archive.io.warc.WARCWriter w,
String timestamp,
String mimetype,
URI baseid,
CrawlURI curi,
org.archive.util.anvl.ANVLRecord namedFields,
long contentLength) |
protected URI |
writeRevisitNotModified(org.archive.io.warc.WARCWriter w,
String timestamp,
URI baseid,
CrawlURI puri,
org.archive.util.anvl.ANVLRecord namedFields) |
protected URI |
writeRevisitUriAgnosticDigest(org.archive.io.warc.WARCWriter w,
String timestamp,
String mimetype,
URI baseid,
CrawlURI curi,
org.archive.util.anvl.ANVLRecord headers) |
protected void |
writeWhoisRecords(org.archive.io.warc.WARCWriter w,
CrawlURI curi,
URI baseid,
String timestamp) |
calcOutputDirs, checkBytesWritten, copyForwardWriteTagIfDupe, doCheckpoint, getCompress, getDirectory, getFrequentFlushes, getHostAddress, getMaxFileSizeBytes, getMaxTotalBytesToWrite, getMaxWaitForIdleMs, getMetadataProvider, getPool, getPoolMaxActive, getPrefix, getSerialNo, getServerCache, getSkipIdenticalDigests, getStartNewFilesOnCheckpoint, getStorePaths, getTemplate, getTotalBytesWritten, getWriteBufferSize, innerProcess, innerRejectProcess, setCompress, setDirectory, setFrequentFlushes, setMaxFileSizeBytes, setMaxTotalBytesToWrite, setMaxWaitForIdleMs, setMetadataProvider, setPool, setPoolMaxActive, setPrefix, setServerCache, setSkipIdenticalDigests, setStartNewFilesOnCheckpoint, setStorePaths, setTemplate, setTotalBytesWritten, setWriteBufferSize, shouldProcess, shouldWrite, start, stop
finishCheckpoint, flattenVia, getBeanName, getEnabled, getKeyedProperties, getRecordedSize, getShouldProcessRule, getURICount, hasHttpAuthenticationCredential, isRunning, isSuccess, process, setBeanName, setEnabled, setRecoveryCheckpoint, setShouldProcessRule, startCheckpoint
clone, equals, finalize, getClass, hashCode, notify, notifyAll, toString, wait, wait, wait
calcOutputDirs, getCompress, getFrequentFlushes, getMaxFileSizeBytes, getPrefix, getTemplate, getWriteBufferSize
finishCheckpoint, setRecoveryCheckpoint, startCheckpoint
protected org.archive.uid.RecordIDGenerator generator
public long getDefaultMaxFileSize()
getDefaultMaxFileSize
in class WriterPoolProcessor
public List<ConfigPath> getDefaultStorePaths()
getDefaultStorePaths
in class WriterPoolProcessor
public boolean getWriteRequests()
public void setWriteRequests(boolean writeRequests)
public boolean getWriteMetadata()
public void setWriteMetadata(boolean writeMetadata)
public boolean getWriteRevisitForIdenticalDigests()
public void setWriteRevisitForIdenticalDigests(boolean writeRevisits)
public boolean getWriteRevisitForNotModified()
public void setWriteRevisitForNotModified(boolean writeRevisits)
public org.archive.uid.RecordIDGenerator getRecordIDGenerator()
getRecordIDGenerator
in interface org.archive.io.warc.WARCWriterPoolSettings
public void setRecordIDGenerator(org.archive.uid.RecordIDGenerator generator)
protected void setupPool(AtomicInteger serialNo)
WriterPoolProcessor
setupPool
in class WriterPoolProcessor
protected ProcessResult innerProcessResult(CrawlURI puri)
innerProcessResult
in class WriterPoolProcessor
curi
- CrawlURI to process.
protected ProcessResult write(String lowerCaseScheme, CrawlURI curi) throws IOException
IOException
protected void updateMetadataAfterWrite(CrawlURI curi, org.archive.io.warc.WARCWriter writer, long startPosition)
protected void writeDnsRecords(CrawlURI curi, org.archive.io.warc.WARCWriter w, URI baseid, String timestamp) throws IOException
IOException
protected void writeWhoisRecords(org.archive.io.warc.WARCWriter w, CrawlURI curi, URI baseid, String timestamp) throws IOException
IOException
protected void writeHttpRecords(CrawlURI curi, org.archive.io.warc.WARCWriter w, URI baseid, String timestamp) throws IOException
IOException
protected void writeFtpRecords(org.archive.io.warc.WARCWriter w, CrawlURI curi, URI baseid, String timestamp) throws IOException
IOException
protected URI writeFtpControlConversation(org.archive.io.warc.WARCWriter w, String timestamp, URI baseid, CrawlURI curi, org.archive.util.anvl.ANVLRecord headers, String controlConversation) throws IOException
IOException
protected URI writeRequest(org.archive.io.warc.WARCWriter w, String timestamp, String mimetype, URI baseid, CrawlURI curi, org.archive.util.anvl.ANVLRecord namedFields) throws IOException
IOException
protected URI writeResponse(org.archive.io.warc.WARCWriter w, String timestamp, String mimetype, URI baseid, CrawlURI curi, org.archive.util.anvl.ANVLRecord suppliedFields) throws IOException
IOException
protected URI writeResource(org.archive.io.warc.WARCWriter w, String timestamp, String mimetype, URI baseid, CrawlURI curi, org.archive.util.anvl.ANVLRecord namedFields) throws IOException
IOException
protected URI writeRevisitDigest(org.archive.io.warc.WARCWriter w, String timestamp, String mimetype, URI baseid, CrawlURI curi, org.archive.util.anvl.ANVLRecord namedFields) throws IOException
IOException
protected URI writeRevisitDigest(org.archive.io.warc.WARCWriter w, String timestamp, String mimetype, URI baseid, CrawlURI curi, org.archive.util.anvl.ANVLRecord namedFields, long contentLength) throws IOException
IOException
protected URI writeRevisitUriAgnosticDigest(org.archive.io.warc.WARCWriter w, String timestamp, String mimetype, URI baseid, CrawlURI curi, org.archive.util.anvl.ANVLRecord headers) throws IOException
IOException
protected URI writeRevisitNotModified(org.archive.io.warc.WARCWriter w, String timestamp, URI baseid, CrawlURI puri, org.archive.util.anvl.ANVLRecord namedFields) throws IOException
IOException
protected void saveHeader(String origName, org.apache.commons.httpclient.HttpMethod method, org.archive.util.anvl.ANVLRecord headers, String newName)
origName
- header name to get if present
method
- HTTP operation containing headers
protected URI writeMetadata(org.archive.io.warc.WARCWriter w, String timestamp, URI baseid, CrawlURI curi, org.archive.util.anvl.ANVLRecord namedFields) throws IOException
IOException
protected URI getRecordID() throws IOException
IOException
protected URI qualifyRecordID(URI base, String key, String value) throws IOException
IOException
public List<String> getMetadata()
getMetadata
in interface org.archive.io.WriterPoolSettings
getMetadata
in class WriterPoolProcessor
protected void addIfNotBlank(org.archive.util.anvl.ANVLRecord record, String label, String value)
protected org.json.JSONObject toCheckpointJson() throws org.json.JSONException
Processor
toCheckpointJson
in class WriterPoolProcessor
org.json.JSONException
protected void fromCheckpointJson(org.json.JSONObject json) throws org.json.JSONException
Processor
fromCheckpointJson
in class WriterPoolProcessor
json
- JSONObject
org.json.JSONException
Copyright © 2003-2014 Internet Archive. All Rights Reserved.