Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGES.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
1.1.7
-----
* [Store origin-code of ARC file header](https://github.com/iipc/webarchive-commons/pull/52/)

1.1.6
-----
* [Handle empty String argument in CharsetDetector.trimAttrValue](https://github.com/iipc/webarchive-commons/pull/49)
Expand Down
7 changes: 6 additions & 1 deletion src/main/java/org/archive/format/ArchiveFileConstants.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,11 @@ public interface ArchiveFileConstants {
* Key for the Archive File version field.
*/
public static final String VERSION_FIELD_KEY = "version";

/**
* Key for the Archive File origin-code field. The origin code stored in ARC files is often hard-coded by the writer, so treat the value found under this key with care.
*/
public static final String ORIGIN_FIELD_KEY = "origin";

/**
* Key for the Archive File length field.
Expand Down Expand Up @@ -80,7 +85,7 @@ public interface ArchiveFileConstants {
* Key for the Archive Record absolute offset into Archive file.
*/
public static final String ABSOLUTE_OFFSET_KEY = "absolute-offset";

public static final String READER_IDENTIFIER_FIELD_KEY =
"reader-identifier";

Expand Down
2 changes: 1 addition & 1 deletion src/main/java/org/archive/format/arc/ARCConstants.java
Original file line number Diff line number Diff line change
Expand Up @@ -196,7 +196,7 @@ public interface ARCConstants extends ArchiveFileConstants {
.asList(new String[] { URL_FIELD_KEY, IP_HEADER_FIELD_KEY,
DATE_FIELD_KEY, MIMETYPE_FIELD_KEY,
LENGTH_FIELD_KEY, VERSION_FIELD_KEY,
ABSOLUTE_OFFSET_KEY });
ORIGIN_FIELD_KEY, ABSOLUTE_OFFSET_KEY });

/**
* Minimum possible record length.
Expand Down
13 changes: 9 additions & 4 deletions src/main/java/org/archive/io/arc/ARCRecord.java
Original file line number Diff line number Diff line change
Expand Up @@ -200,7 +200,7 @@ public ARCRecord(InputStream in, ArchiveRecordHeader metaData,
public ARCRecord(InputStream in, final String identifier,
final long offset, boolean digest, boolean strict,
final boolean parseHttpHeaders,
final boolean isAlignedOnFirstRecord, String version)
final boolean isAlignedOnFirstRecord, String version)
throws IOException {
super(in, null, 0, digest, strict);
setHeader(parseHeaders(in, identifier, offset, strict, isAlignedOnFirstRecord, version));
Expand Down Expand Up @@ -243,6 +243,7 @@ private ArchiveRecordHeader parseHeaders(final InputStream in,
getTokenizedHeaderLine(in, firstLineValues);

int bodyOffset = 0;
String origin = "";
if (offset == 0 && isAlignedOnFirstRecord) {
// If offset is zero and we were aligned at first record on
// creation (See #alignedOnFirstRecord for more on this), then no
Expand All @@ -263,6 +264,7 @@ private ArchiveRecordHeader parseHeaders(final InputStream in,
bodyOffset += getTokenizedHeaderLine(in, secondLineValues);
version = ((String)secondLineValues.get(0) +
"." + (String)secondLineValues.get(1));
origin = (String)secondLineValues.get(2);

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is it safe to assume there is always an Origin string at position 2? Or do we need to check secondLineValues.size() > 2?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know if it is safe to assume it is there, but I do know that that field will frequently contain erroneous information.

I just had a look at our own ARCs and they all have InternetArchive set for this "origin". Seems webarchive-commons (and thus Heritrix) ARCWriter just hardcodes that value: https://github.com/jrwiebe/webarchive-commons/blob/master/src/main/java/org/archive/io/arc/ARCWriter.java#L272

Given how prevalent this abuse is by now, I think it is safe to say that this field has zero informational value.

// Just read over the 3rd line. We used to parse it and use
// values found here but now we just hardcode them to avoid
// having to read this 3rd line even for random arc file accesses.
Expand All @@ -271,7 +273,8 @@ private ArchiveRecordHeader parseHeaders(final InputStream in,
}
setBodyOffset(bodyOffset);

return computeMetaData(this.headerFieldNameKeys, firstLineValues, version, offset, identifier);
return computeMetaData(this.headerFieldNameKeys, firstLineValues,
version, origin, offset, identifier);
}

/**
Expand Down Expand Up @@ -362,7 +365,8 @@ private int getTokenizedHeaderLine(final InputStream stream,
* @exception IOException If no. of keys doesn't match no. of values.
*/
private ARCRecordMetaData computeMetaData(List<String> keys,
List<String> values, String v, long offset, final String identifier)
List<String> values, String v, String origin,
long offset, final String identifier)
throws IOException {
if (keys.size() != values.size()) {
List<String> originalValues = values;
Expand Down Expand Up @@ -423,6 +427,7 @@ private ARCRecordMetaData computeMetaData(List<String> keys,
}

headerFields.put(VERSION_FIELD_KEY, v);
headerFields.put(ORIGIN_FIELD_KEY, origin);
headerFields.put(ABSOLUTE_OFFSET_KEY, new Long(offset));

return new ARCRecordMetaData(identifier, headerFields);
Expand Down Expand Up @@ -832,4 +837,4 @@ protected String getDigest4Cdx(ArchiveRecordHeader h) {
}
return (result != null) ? result: super.getDigest4Cdx(h);
}
}
}
9 changes: 8 additions & 1 deletion src/main/java/org/archive/io/arc/ARCRecordMetaData.java
Original file line number Diff line number Diff line change
Expand Up @@ -168,6 +168,13 @@ public String getVersion() {
return (String)this.headerFields.get(VERSION_FIELD_KEY);
}

/**
 * Looks up the origin code held in this record's header fields.
 *
 * @return the header field stored under {@code ORIGIN_FIELD_KEY}, as a String.
 */
public String getOrigin() {
    final Object originCode = this.headerFields.get(ORIGIN_FIELD_KEY);
    return (String) originCode;
}

/**
* @return Offset into arcfile at which this record begins.
*/
Expand Down Expand Up @@ -264,4 +271,4 @@ public int getContentBegin() {
protected void setContentBegin(final int offset) {
// Store the supplied offset in this object's contentBegin field.
this.contentBegin = offset;
}
}
}